treat 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +3 -0
- data/TODO +14 -26
- data/bin/INFO +1 -1
- data/lib/treat/buildable.rb +10 -11
- data/lib/treat/categories.rb +8 -6
- data/lib/treat/category.rb +7 -2
- data/lib/treat/delegatable.rb +64 -56
- data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
- data/lib/treat/detectors/language/language_detector.rb +2 -1
- data/lib/treat/detectors/language/what_language.rb +2 -2
- data/lib/treat/detectors.rb +3 -0
- data/lib/treat/entities/entity.rb +1 -1
- data/lib/treat/entities.rb +9 -10
- data/lib/treat/exception.rb +3 -1
- data/lib/treat/extractors/named_entity/abner.rb +1 -1
- data/lib/treat/extractors/named_entity/stanford.rb +2 -2
- data/lib/treat/extractors/time/chronic.rb +2 -2
- data/lib/treat/extractors/time/nickel.rb +2 -2
- data/lib/treat/extractors/topic_words/lda.rb +2 -2
- data/lib/treat/extractors.rb +12 -9
- data/lib/treat/feature.rb +6 -1
- data/lib/treat/formatters/cleaners/html.rb +1 -1
- data/lib/treat/formatters.rb +8 -8
- data/lib/treat/group.rb +11 -10
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
- data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
- data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
- data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
- data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
- data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
- data/lib/treat/inflectors.rb +8 -21
- data/lib/treat/kernel.rb +120 -0
- data/lib/treat/languages/arabic.rb +14 -0
- data/lib/treat/languages/categories.rb +5 -0
- data/lib/treat/languages/chinese.rb +12 -0
- data/lib/treat/languages/english/categories.rb +23 -0
- data/lib/treat/{resources → languages/english}/tags.rb +127 -184
- data/lib/treat/languages/english.rb +33 -0
- data/lib/treat/languages/french.rb +17 -0
- data/lib/treat/languages/german.rb +17 -0
- data/lib/treat/languages/italian.rb +14 -0
- data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
- data/lib/treat/languages/xinhua.rb +12 -0
- data/lib/treat/languages.rb +91 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
- data/lib/treat/lexicalizers/tag/brill.rb +2 -1
- data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
- data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
- data/lib/treat/lexicalizers.rb +1 -1
- data/lib/treat/object.rb +6 -0
- data/lib/treat/processors/parsers/enju.rb +3 -2
- data/lib/treat/processors/parsers/stanford.rb +15 -12
- data/lib/treat/processors/segmenters/punkt.rb +1 -1
- data/lib/treat/processors/segmenters/stanford.rb +7 -5
- data/lib/treat/processors/segmenters/tactful.rb +1 -1
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
- data/lib/treat/processors/tokenizers/stanford.rb +7 -5
- data/lib/treat/visitable.rb +2 -1
- data/lib/treat.rb +105 -54
- data/test/tc_entity.rb +5 -0
- data/test/tc_resources.rb +5 -5
- data/test/tc_treat.rb +1 -2
- data/test/tests.rb +2 -1
- metadata +63 -64
- data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
- data/lib/treat/resources/categories.rb +0 -18
- data/lib/treat/resources/delegates.rb +0 -96
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +0 -8
- data/lib/treat/resources/formats.rb +0 -23
- data/lib/treat/resources/languages.rb +0 -86
- data/lib/treat/resources.rb +0 -10
- data/lib/treat/utilities.rb +0 -127
data/lib/treat/group.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
module Treat
|
2
2
|
module Group
|
3
|
+
# Modify the extended class.
|
3
4
|
def self.extended(group)
|
4
5
|
group.module_eval do
|
5
6
|
class << self
|
@@ -75,22 +76,22 @@ module Treat
|
|
75
76
|
end
|
76
77
|
@@list[mod]
|
77
78
|
end
|
78
|
-
#
|
79
|
-
|
80
|
-
|
81
|
-
end
|
79
|
+
# Get constants in this module, excluding those
|
80
|
+
# defined by parent modules.
|
81
|
+
def const_get(const); super(const, false); end
|
82
82
|
# Autoload the algorithms.
|
83
83
|
def const_missing(const)
|
84
84
|
bits = self.ancestors[0].to_s.split('::')
|
85
85
|
bits.collect! { |bit| ucc(bit) }
|
86
|
-
file = bits.join('/') + "/#{ucc(const)}"
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
86
|
+
file = bits.join('/') + "/#{ucc(const)}"
|
87
|
+
if not File.readable?("#{Treat.lib}/#{file}.rb")
|
88
|
+
raise Treat::Exception,
|
89
|
+
"File '#{file}.rb' corresponding to requested delegate "+
|
90
|
+
"#{self}::#{const} does not exist."
|
91
|
+
else
|
91
92
|
require file
|
92
93
|
const_get(const)
|
93
|
-
|
94
|
+
end
|
94
95
|
end
|
95
96
|
end
|
96
97
|
end
|
@@ -2,7 +2,7 @@ module Treat
|
|
2
2
|
module Inflectors
|
3
3
|
module CardinalWords
|
4
4
|
class Linguistics
|
5
|
-
|
5
|
+
silence_warnings { require 'linguistics' }
|
6
6
|
#
|
7
7
|
# Options:
|
8
8
|
#
|
@@ -32,12 +32,12 @@ module Treat
|
|
32
32
|
begin
|
33
33
|
l = entity.language.to_s.upcase
|
34
34
|
delegate = nil
|
35
|
-
|
35
|
+
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
36
36
|
rescue RuntimeError
|
37
37
|
raise "Ruby Linguistics does not have a module " +
|
38
38
|
" installed for the #{entity.language} language."
|
39
39
|
end
|
40
|
-
|
40
|
+
silence_warnings { delegate.numwords(entity.to_s, options) }
|
41
41
|
end
|
42
42
|
end
|
43
43
|
end
|
@@ -1,23 +1,23 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
|
-
module
|
3
|
+
module Conjugations
|
4
4
|
class Linguistics
|
5
|
-
|
5
|
+
silence_warnings { require 'linguistics' }
|
6
6
|
def self.conjugate(entity, parameters)
|
7
7
|
begin
|
8
8
|
l = entity.language.to_s.upcase
|
9
9
|
delegate = nil
|
10
|
-
|
10
|
+
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
11
11
|
rescue RuntimeError
|
12
12
|
raise "Ruby Linguistics does not have a module " +
|
13
13
|
" installed for the #{entity.language} language."
|
14
14
|
end
|
15
15
|
if parameters[:mode] == :infinitive
|
16
|
-
|
16
|
+
silence_warnings { delegate.infinitive(entity.to_s) }
|
17
17
|
elsif parameters[:mode] == :participle && parameters[:tense] == :present
|
18
|
-
|
18
|
+
silence_warnings { delegate.present_participle(entity.to_s) }
|
19
19
|
elsif parameters[:count] == :plural && parameters.size == 1
|
20
|
-
|
20
|
+
silence_warnings { delegate.plural_verb(entity.to_s) }
|
21
21
|
else
|
22
22
|
raise Treat::Exception,
|
23
23
|
'This combination of modes, tenses, persons ' +
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
|
-
module
|
4
|
-
|
3
|
+
module Declensions
|
4
|
+
silence_warnings { require 'linguistics' }
|
5
5
|
# Obtain word declensions in English using the
|
6
6
|
# ruby 'linguistics' gem.
|
7
7
|
class Linguistics
|
@@ -9,7 +9,7 @@ module Treat
|
|
9
9
|
begin
|
10
10
|
l = entity.language.to_s.upcase
|
11
11
|
delegate = nil
|
12
|
-
|
12
|
+
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
13
13
|
rescue RuntimeError
|
14
14
|
raise "Ruby Linguistics does not have a module " +
|
15
15
|
" installed for the #{entity.language} language."
|
@@ -18,9 +18,9 @@ module Treat
|
|
18
18
|
if options[:count] == :plural
|
19
19
|
if entity.has?(:category) &&
|
20
20
|
[:noun, :adjective, :verb].include?(entity.category)
|
21
|
-
|
21
|
+
silence_warnings { delegate.send(:"plural_#{entity.category}", string) }
|
22
22
|
else
|
23
|
-
|
23
|
+
silence_warnings { delegate.plural(string) }
|
24
24
|
end
|
25
25
|
end
|
26
26
|
end
|
@@ -2,18 +2,18 @@ module Treat
|
|
2
2
|
module Inflectors
|
3
3
|
module OrdinalWords
|
4
4
|
class Linguistics
|
5
|
-
|
5
|
+
silence_warnings { require 'linguistics' }
|
6
6
|
def self.ordinal_words(number, options = {})
|
7
7
|
begin
|
8
8
|
l = number.language.to_s.upcase
|
9
9
|
delegate = nil
|
10
|
-
|
10
|
+
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
11
11
|
rescue RuntimeError
|
12
|
-
lang = Treat::
|
12
|
+
lang = Treat::Languages.describe(number.language)
|
13
13
|
raise "Ruby Linguistics does not have a module " +
|
14
14
|
" installed for the #{lang} language."
|
15
15
|
end
|
16
|
-
|
16
|
+
silence_warnings { delegate.ordinate(number.to_s) }
|
17
17
|
end
|
18
18
|
end
|
19
19
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
|
-
module
|
3
|
+
module Stem
|
4
4
|
# Stems words using the 'ruby-stemmer' gem, which
|
5
5
|
# wraps a C version of the Porter stemming algorithm.
|
6
6
|
#
|
@@ -9,13 +9,13 @@ module Treat
|
|
9
9
|
# Program, Vol. 14, no. 3, pp 130-137,
|
10
10
|
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
11
11
|
class PorterC
|
12
|
-
|
12
|
+
silence_warnings { require 'lingua/stemmer' }
|
13
13
|
::LinguaStemmer = ::Lingua
|
14
14
|
Object.instance_eval { remove_const :Lingua }
|
15
15
|
# Stem the word using the Porter C algorithm.
|
16
16
|
# Options: none.
|
17
17
|
def self.stem(word, options = {})
|
18
|
-
|
18
|
+
silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
|
19
19
|
end
|
20
20
|
end
|
21
21
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
|
-
module
|
3
|
+
module Stem
|
4
4
|
# Stems a word using the UEA algorithm, implemented
|
5
5
|
# by the 'uea-stemmer' gem.
|
6
6
|
#
|
@@ -15,13 +15,13 @@ module Treat
|
|
15
15
|
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
|
16
16
|
class UEA
|
17
17
|
# Require the 'uea-stemmer' gem.
|
18
|
-
|
18
|
+
silence_warnings { require 'uea-stemmer' }
|
19
19
|
# Keep only one copy of the stemmer.
|
20
20
|
@@stemmer = nil
|
21
21
|
# Stems a word using the UEA algorithm, implemented
|
22
22
|
# by the 'uea-stemmer' gem.
|
23
23
|
def self.stem(entity, options = {})
|
24
|
-
@@stemmer ||=
|
24
|
+
@@stemmer ||= silence_warnings { ::UEAStemmer.new }
|
25
25
|
@@stemmer.stem(entity.to_s).strip
|
26
26
|
end
|
27
27
|
end
|
data/lib/treat/inflectors.rb
CHANGED
@@ -1,44 +1,31 @@
|
|
1
1
|
module Treat
|
2
2
|
# Algorithms to retrieve the inflections of a word.
|
3
|
-
# Stemmers return the stem (not root form) of a word.
|
4
|
-
# Taggers return the part of speech tag of a word.
|
5
|
-
# Inflectors allow to retrieve the different inflections of a
|
6
|
-
# noun (declensions), a verb (conjugations). Lexicons return,
|
7
|
-
# among other things, the gloss or synset of a word.
|
8
3
|
module Inflectors
|
9
|
-
#
|
10
|
-
module
|
4
|
+
# Return the stem (*not root form*) of a word.
|
5
|
+
module Stem
|
11
6
|
extend Group
|
12
7
|
self.type = :annotator
|
13
8
|
self.targets = [:word]
|
14
9
|
end
|
15
|
-
#
|
16
|
-
module
|
10
|
+
# Retrieve the different declensions of a noun (singular, plural).
|
11
|
+
module Declensions
|
17
12
|
extend Group
|
18
13
|
self.type = :annotator
|
19
14
|
self.targets = [:word]
|
20
15
|
end
|
21
|
-
#
|
22
|
-
|
23
|
-
module Declensors
|
16
|
+
# Retrieve the different conjugations of a word.
|
17
|
+
module Conjugations
|
24
18
|
extend Group
|
25
19
|
self.type = :annotator
|
26
20
|
self.targets = [:word]
|
27
21
|
end
|
28
|
-
#
|
29
|
-
# a word.
|
30
|
-
module Conjugators
|
31
|
-
extend Group
|
32
|
-
self.type = :annotator
|
33
|
-
self.targets = [:word]
|
34
|
-
end
|
35
|
-
# Cardinal retrieve the full text description of a number.
|
22
|
+
# Retrieve the full text description of a cardinal number.
|
36
23
|
module CardinalWords
|
37
24
|
extend Group
|
38
25
|
self.type = :annotator
|
39
26
|
self.targets = [:number]
|
40
27
|
end
|
41
|
-
#
|
28
|
+
# Retrieve the full text description of an ordinal number.
|
42
29
|
module OrdinalWords
|
43
30
|
extend Group
|
44
31
|
self.type = :annotator
|
data/lib/treat/kernel.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
# Extends the core Kernel module to provide
|
2
|
+
# easy access to utility functions used across
|
3
|
+
# the library.
|
4
|
+
module Kernel

  require 'fileutils'
  require 'tempfile'

  # A list of acronyms used in class names within
  # the program. These do not CamelCase; they
  # CAMELCase. Stored as a '|'-joined string so it
  # can be interpolated into a regexp alternation.
  Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF', 'GOCR'].join('|')

  # A cache to optimize camel casing.
  @@cc_cache = {}

  # A cache to optimize un camel casing.
  @@ucc_cache = {}

  # Returns the platform we are running on, i.e. the
  # second dash-separated component of RUBY_PLATFORM
  # (nil if RUBY_PLATFORM contains no dash).
  def platform
    RUBY_PLATFORM.split("-")[1]
  end

  # Runs a block of code without warnings and returns
  # the value of the block.
  #
  # $VERBOSE is restored in an ensure clause so that an
  # exception raised by the block (callers routinely
  # rescue such exceptions) does not leave warnings
  # disabled for the remainder of the process.
  def silence_warnings(&block)
    warn_level = $VERBOSE
    $VERBOSE = nil
    begin
      block.call
    ensure
      $VERBOSE = warn_level
    end
  end

  # Runs a block of code while blocking
  # stdout. Currently not implemented: the
  # block is simply yielded to.
  def silence_streams(*streams)
    yield
  end

  # Create a temporary file which is deleted
  # after execution of the block.
  #
  # ext   - extension of the file, without the dot.
  # value - optional content; written with #puts, so a
  #         trailing newline is appended.
  # block - receives the path of the temporary file.
  #
  # Returns the value of the block.
  def create_temp_file(ext, value = nil, &block)
    tmp = Tempfile.new(['', ".#{ext.to_s}"], Treat.tmp)
    begin
      if value
        tmp.puts(value)
        # Flush so that code opening the file by its
        # path sees the content and not an empty file.
        tmp.flush
      end
      block.call(tmp.path)
    ensure
      # Close and unlink, even when the block raises.
      tmp.close!
    end
  end

  # Convert un_camel_case to CamelCase.
  #
  # Every acronym listed in Acronyms is upper-cased
  # wherever it occurs (html_cleaner => HTMLCleaner),
  # including 'gocr', which was previously only
  # recognized when followed by a non-lowercase
  # character. Results are cached per argument; the
  # returned string is the cached object, so callers
  # should not mutate it.
  def camel_case(o_phrase)
    cached = @@cc_cache[o_phrase]
    return cached if cached
    phrase = o_phrase.to_s.dup
    phrase.gsub!(/#{Acronyms.downcase}/) { |a| a.upcase }
    phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
    phrase.gsub!('_', '')
    @@cc_cache[o_phrase] = phrase
    phrase
  end

  alias :cc :camel_case

  # Convert CamelCase to un_camel_case.
  #
  # Acronyms are first reduced to a single capitalized
  # word (HTML => Html) so that they yield one
  # underscore-separated segment rather than one per
  # letter. Results are cached per argument; the
  # returned string is the cached object.
  def un_camel_case(o_phrase)
    cached = @@ucc_cache[o_phrase]
    return cached if cached
    phrase = o_phrase.to_s.dup
    phrase.gsub!(/#{Acronyms}/) { |a| a.downcase.capitalize }
    phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
    # Drop the underscore produced by a leading capital.
    phrase = phrase[1..-1] if phrase[0] == '_'
    @@ucc_cache[o_phrase] = phrase
    phrase
  end

  alias :ucc :un_camel_case

  # Retrieve the Class from a Module::Class.
  def class_name(n); n.to_s.split('::')[-1]; end

  alias :cl :class_name

  # Search the list to see if there are words similar to #name
  # in the #list. If yes, return a string saying "Perhaps you
  # meant ... ?" with the names; otherwise return an empty string.
  #
  # "Similar" means a Levenshtein distance of exactly 1. Elements
  # for which no distance can be computed (nil) are ignored, and
  # suggestions are interpolated so that Symbols are accepted.
  def did_you_mean?(list, name)
    sugg = []
    list.each do |element|
      sugg << element if levenshtein(element, name) == 1
    end
    msg = ''
    unless sugg.empty?
      if sugg.size == 1
        msg += " Perhaps you meant '#{sugg[0]}' ?"
      else
        sugg_quote = sugg[0..-2].map { |x| "'#{x}'" }
        msg += " Perhaps you meant #{sugg_quote.join(', ')}," +
        " or '#{sugg[-1]}' ?"
      end
    end
    msg
  end

  alias :dym? :did_you_mean?

  # Return, as a symbol, the label of the n-th entry of the
  # call stack as seen from inside this method (n is passed
  # to Kernel#caller unchanged).
  #
  # Returns :"" when there is no such frame or when the frame
  # carries no method label. Both the historical backtrace
  # format (in `meth') and the Ruby 3.4+ format
  # (in 'Klass#meth') are understood; the class qualifier of
  # the latter is dropped so the result is the same on every
  # Ruby version.
  def caller_method(n = 3)
    at = (caller(n) || []).first
    match = at && /^(.+?):(\d+)(?::in [`'](.*)')?/.match(at)
    label = match ? match[3].to_s : ''
    label = label.sub(/\b(?:[A-Z]\w*::)*[A-Z]\w*[#.]/, '')
    :"#{label}"
  end

  alias :cm :caller_method

  # Return the Levenshtein distance between two strings,
  # taking into account the costs of insertion, deletion,
  # and substitution. Used by did_you_mean?
  #
  # Returns nil if either argument is nil. Arguments only
  # need to respond to #length and #[] (Strings, Symbols).
  def levenshtein(first, other, ins=1, del=1, sub=1)
    return nil if first.nil? || other.nil?
    cols = first.length
    rows = other.length
    # dm[i][j] is the cost of aligning the first i items
    # of other with the first j items of first.
    dm = Array.new(rows + 1) { |i| [i * del] + [0] * cols }
    dm[0] = (0..cols).collect { |j| j * ins }
    1.upto(rows) do |i|
      1.upto(cols) do |j|
        dm[i][j] = [
          dm[i-1][j-1] + (first[j-1] == other[i-1] ? 0 : sub),
          dm[i][j-1] + ins,
          dm[i-1][j] + del
        ].min
      end
    end
    dm[rows][cols]
  end

end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Treat
  module Languages
    class English

      # A list of all possible word categories.
      Categories = [
        :adjective, :adverb, :noun, :verb, :interjection,
        :clitic, :coverb, :conjunction, :determiner, :particle,
        :preposition, :pronoun, :number, :symbol, :punctuation,
        :complementizer
      ]

      # AlignedWordTags is read as a flat list of
      # (description, tags) pairs. The first word of the
      # description, lower-cased, names the category; the
      # three tags are indexed under the tag set they are
      # stored for, in this order.
      tag_sets = [:claws_5, :brown, :penn]
      tag_index = {}

      Treat::Languages::English::AlignedWordTags.each_slice(2) do |description, tags|
        # Detach commas so that "Noun, plural" yields "Noun".
        first_word = description.gsub(',', ' ,').split(' ').first
        category = first_word.downcase.intern
        tag_sets.each_with_index do |tag_set, position|
          entry = (tag_index[tags[position]] ||= {})
          entry[tag_set] = category
        end
      end

      # A hash converting word tags to word categories.
      WordTagToCategory = tag_index

    end
  end
end
|