treat 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +1 -0
- data/README +3 -0
- data/TODO +14 -26
- data/bin/INFO +1 -1
- data/lib/treat/buildable.rb +10 -11
- data/lib/treat/categories.rb +8 -6
- data/lib/treat/category.rb +7 -2
- data/lib/treat/delegatable.rb +64 -56
- data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
- data/lib/treat/detectors/language/language_detector.rb +2 -1
- data/lib/treat/detectors/language/what_language.rb +2 -2
- data/lib/treat/detectors.rb +3 -0
- data/lib/treat/entities/entity.rb +1 -1
- data/lib/treat/entities.rb +9 -10
- data/lib/treat/exception.rb +3 -1
- data/lib/treat/extractors/named_entity/abner.rb +1 -1
- data/lib/treat/extractors/named_entity/stanford.rb +2 -2
- data/lib/treat/extractors/time/chronic.rb +2 -2
- data/lib/treat/extractors/time/nickel.rb +2 -2
- data/lib/treat/extractors/topic_words/lda.rb +2 -2
- data/lib/treat/extractors.rb +12 -9
- data/lib/treat/feature.rb +6 -1
- data/lib/treat/formatters/cleaners/html.rb +1 -1
- data/lib/treat/formatters.rb +8 -8
- data/lib/treat/group.rb +11 -10
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
- data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
- data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
- data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
- data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
- data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
- data/lib/treat/inflectors.rb +8 -21
- data/lib/treat/kernel.rb +120 -0
- data/lib/treat/languages/arabic.rb +14 -0
- data/lib/treat/languages/categories.rb +5 -0
- data/lib/treat/languages/chinese.rb +12 -0
- data/lib/treat/languages/english/categories.rb +23 -0
- data/lib/treat/{resources → languages/english}/tags.rb +127 -184
- data/lib/treat/languages/english.rb +33 -0
- data/lib/treat/languages/french.rb +17 -0
- data/lib/treat/languages/german.rb +17 -0
- data/lib/treat/languages/italian.rb +14 -0
- data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
- data/lib/treat/languages/xinhua.rb +12 -0
- data/lib/treat/languages.rb +91 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
- data/lib/treat/lexicalizers/tag/brill.rb +2 -1
- data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
- data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
- data/lib/treat/lexicalizers.rb +1 -1
- data/lib/treat/object.rb +6 -0
- data/lib/treat/processors/parsers/enju.rb +3 -2
- data/lib/treat/processors/parsers/stanford.rb +15 -12
- data/lib/treat/processors/segmenters/punkt.rb +1 -1
- data/lib/treat/processors/segmenters/stanford.rb +7 -5
- data/lib/treat/processors/segmenters/tactful.rb +1 -1
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
- data/lib/treat/processors/tokenizers/stanford.rb +7 -5
- data/lib/treat/visitable.rb +2 -1
- data/lib/treat.rb +105 -54
- data/test/tc_entity.rb +5 -0
- data/test/tc_resources.rb +5 -5
- data/test/tc_treat.rb +1 -2
- data/test/tests.rb +2 -1
- metadata +63 -64
- data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
- data/lib/treat/resources/categories.rb +0 -18
- data/lib/treat/resources/delegates.rb +0 -96
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +0 -8
- data/lib/treat/resources/formats.rb +0 -23
- data/lib/treat/resources/languages.rb +0 -86
- data/lib/treat/resources.rb +0 -10
- data/lib/treat/utilities.rb +0 -127
data/lib/treat/group.rb
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
module Treat
|
2
2
|
module Group
|
3
|
+
# Modify the extended class.
|
3
4
|
def self.extended(group)
|
4
5
|
group.module_eval do
|
5
6
|
class << self
|
@@ -75,22 +76,22 @@ module Treat
|
|
75
76
|
end
|
76
77
|
@@list[mod]
|
77
78
|
end
|
78
|
-
#
|
79
|
-
|
80
|
-
|
81
|
-
end
|
79
|
+
# Get constants in this module, excluding those
|
80
|
+
# defined by parent modules.
|
81
|
+
def const_get(const); super(const, false); end
|
82
82
|
# Autoload the algorithms.
|
83
83
|
def const_missing(const)
|
84
84
|
bits = self.ancestors[0].to_s.split('::')
|
85
85
|
bits.collect! { |bit| ucc(bit) }
|
86
|
-
file = bits.join('/') + "/#{ucc(const)}"
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
86
|
+
file = bits.join('/') + "/#{ucc(const)}"
|
87
|
+
if not File.readable?("#{Treat.lib}/#{file}.rb")
|
88
|
+
raise Treat::Exception,
|
89
|
+
"File '#{file}.rb' corresponding to requested delegate "+
|
90
|
+
"#{self}::#{const} does not exist."
|
91
|
+
else
|
91
92
|
require file
|
92
93
|
const_get(const)
|
93
|
-
|
94
|
+
end
|
94
95
|
end
|
95
96
|
end
|
96
97
|
end
|
@@ -2,7 +2,7 @@ module Treat
|
|
2
2
|
module Inflectors
|
3
3
|
module CardinalWords
|
4
4
|
class Linguistics
|
5
|
-
|
5
|
+
silence_warnings { require 'linguistics' }
|
6
6
|
#
|
7
7
|
# Options:
|
8
8
|
#
|
@@ -32,12 +32,12 @@ module Treat
|
|
32
32
|
begin
|
33
33
|
l = entity.language.to_s.upcase
|
34
34
|
delegate = nil
|
35
|
-
|
35
|
+
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
36
36
|
rescue RuntimeError
|
37
37
|
raise "Ruby Linguistics does not have a module " +
|
38
38
|
" installed for the #{entity.language} language."
|
39
39
|
end
|
40
|
-
|
40
|
+
silence_warnings { delegate.numwords(entity.to_s, options) }
|
41
41
|
end
|
42
42
|
end
|
43
43
|
end
|
@@ -1,23 +1,23 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
|
-
module
|
3
|
+
module Conjugations
|
4
4
|
class Linguistics
|
5
|
-
|
5
|
+
silence_warnings { require 'linguistics' }
|
6
6
|
def self.conjugate(entity, parameters)
|
7
7
|
begin
|
8
8
|
l = entity.language.to_s.upcase
|
9
9
|
delegate = nil
|
10
|
-
|
10
|
+
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
11
11
|
rescue RuntimeError
|
12
12
|
raise "Ruby Linguistics does not have a module " +
|
13
13
|
" installed for the #{entity.language} language."
|
14
14
|
end
|
15
15
|
if parameters[:mode] == :infinitive
|
16
|
-
|
16
|
+
silence_warnings { delegate.infinitive(entity.to_s) }
|
17
17
|
elsif parameters[:mode] == :participle && parameters[:tense] == :present
|
18
|
-
|
18
|
+
silence_warnings { delegate.present_participle(entity.to_s) }
|
19
19
|
elsif parameters[:count] == :plural && parameters.size == 1
|
20
|
-
|
20
|
+
silence_warnings { delegate.plural_verb(entity.to_s) }
|
21
21
|
else
|
22
22
|
raise Treat::Exception,
|
23
23
|
'This combination of modes, tenses, persons ' +
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
|
-
module
|
4
|
-
|
3
|
+
module Declensions
|
4
|
+
silence_warnings { require 'linguistics' }
|
5
5
|
# Obtain word declensions in English using the
|
6
6
|
# ruby 'linguistics' gem.
|
7
7
|
class Linguistics
|
@@ -9,7 +9,7 @@ module Treat
|
|
9
9
|
begin
|
10
10
|
l = entity.language.to_s.upcase
|
11
11
|
delegate = nil
|
12
|
-
|
12
|
+
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
13
13
|
rescue RuntimeError
|
14
14
|
raise "Ruby Linguistics does not have a module " +
|
15
15
|
" installed for the #{entity.language} language."
|
@@ -18,9 +18,9 @@ module Treat
|
|
18
18
|
if options[:count] == :plural
|
19
19
|
if entity.has?(:category) &&
|
20
20
|
[:noun, :adjective, :verb].include?(entity.category)
|
21
|
-
|
21
|
+
silence_warnings { delegate.send(:"plural_#{entity.category}", string) }
|
22
22
|
else
|
23
|
-
|
23
|
+
silence_warnings { delegate.plural(string) }
|
24
24
|
end
|
25
25
|
end
|
26
26
|
end
|
@@ -2,18 +2,18 @@ module Treat
|
|
2
2
|
module Inflectors
|
3
3
|
module OrdinalWords
|
4
4
|
class Linguistics
|
5
|
-
|
5
|
+
silence_warnings { require 'linguistics' }
|
6
6
|
def self.ordinal_words(number, options = {})
|
7
7
|
begin
|
8
8
|
l = number.language.to_s.upcase
|
9
9
|
delegate = nil
|
10
|
-
|
10
|
+
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
11
11
|
rescue RuntimeError
|
12
|
-
lang = Treat::
|
12
|
+
lang = Treat::Languages.describe(number.language)
|
13
13
|
raise "Ruby Linguistics does not have a module " +
|
14
14
|
" installed for the #{lang} language."
|
15
15
|
end
|
16
|
-
|
16
|
+
silence_warnings { delegate.ordinate(number.to_s) }
|
17
17
|
end
|
18
18
|
end
|
19
19
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
|
-
module
|
3
|
+
module Stem
|
4
4
|
# Stems words using the 'ruby-stemmer' gem, which
|
5
5
|
# wraps a C version of the Porter stemming algorithm.
|
6
6
|
#
|
@@ -9,13 +9,13 @@ module Treat
|
|
9
9
|
# Program, Vol. 14, no. 3, pp 130-137,
|
10
10
|
# Original C implementation: http://www.tartarus.org/~martin/PorterStemmer.
|
11
11
|
class PorterC
|
12
|
-
|
12
|
+
silence_warnings { require 'lingua/stemmer' }
|
13
13
|
::LinguaStemmer = ::Lingua
|
14
14
|
Object.instance_eval { remove_const :Lingua }
|
15
15
|
# Stem the word using the Porter C algorithm.
|
16
16
|
# Options: none.
|
17
17
|
def self.stem(word, options = {})
|
18
|
-
|
18
|
+
silence_warnings { ::LinguaStemmer.stemmer(word.to_s) }
|
19
19
|
end
|
20
20
|
end
|
21
21
|
end
|
@@ -1,6 +1,6 @@
|
|
1
1
|
module Treat
|
2
2
|
module Inflectors
|
3
|
-
module
|
3
|
+
module Stem
|
4
4
|
# Stems a word using the UEA algorithm, implemented
|
5
5
|
# by the 'uea-stemmer' gem.
|
6
6
|
#
|
@@ -15,13 +15,13 @@ module Treat
|
|
15
15
|
# http://www.uea.ac.uk/polopoly_fs/1.85493!stemmer25feb.pdf
|
16
16
|
class UEA
|
17
17
|
# Require the 'uea-stemmer' gem.
|
18
|
-
|
18
|
+
silence_warnings { require 'uea-stemmer' }
|
19
19
|
# Keep only one copy of the stemmer.
|
20
20
|
@@stemmer = nil
|
21
21
|
# Stems a word using the UEA algorithm, implemented
|
22
22
|
# by the 'uea-stemmer' gem.
|
23
23
|
def self.stem(entity, options = {})
|
24
|
-
@@stemmer ||=
|
24
|
+
@@stemmer ||= silence_warnings { ::UEAStemmer.new }
|
25
25
|
@@stemmer.stem(entity.to_s).strip
|
26
26
|
end
|
27
27
|
end
|
data/lib/treat/inflectors.rb
CHANGED
@@ -1,44 +1,31 @@
|
|
1
1
|
module Treat
|
2
2
|
# Algorithms to retrieve the inflections of a word.
|
3
|
-
# Stemmers return the stem (not root form) of a word.
|
4
|
-
# Taggers return the part of speech tag of a word.
|
5
|
-
# Inflectors allow to retrieve the different inflections of a
|
6
|
-
# noun (declensions), a verb (conjugations). Lexicons return,
|
7
|
-
# among other things, the gloss or synset of a word.
|
8
3
|
module Inflectors
|
9
|
-
#
|
10
|
-
module
|
4
|
+
# Return the stem (*not root form*) of a word.
|
5
|
+
module Stem
|
11
6
|
extend Group
|
12
7
|
self.type = :annotator
|
13
8
|
self.targets = [:word]
|
14
9
|
end
|
15
|
-
#
|
16
|
-
module
|
10
|
+
# Retrieve the different declensions of a noun (singular, plural).
|
11
|
+
module Declensions
|
17
12
|
extend Group
|
18
13
|
self.type = :annotator
|
19
14
|
self.targets = [:word]
|
20
15
|
end
|
21
|
-
#
|
22
|
-
|
23
|
-
module Declensors
|
16
|
+
# Retrieve the different conjugations of a word.
|
17
|
+
module Conjugations
|
24
18
|
extend Group
|
25
19
|
self.type = :annotator
|
26
20
|
self.targets = [:word]
|
27
21
|
end
|
28
|
-
#
|
29
|
-
# a word.
|
30
|
-
module Conjugators
|
31
|
-
extend Group
|
32
|
-
self.type = :annotator
|
33
|
-
self.targets = [:word]
|
34
|
-
end
|
35
|
-
# Cardinal retrieve the full text description of a number.
|
22
|
+
# Retrieve the full text description of a cardinal number.
|
36
23
|
module CardinalWords
|
37
24
|
extend Group
|
38
25
|
self.type = :annotator
|
39
26
|
self.targets = [:number]
|
40
27
|
end
|
41
|
-
#
|
28
|
+
# Retrieve the full text description of an ordinal number.
|
42
29
|
module OrdinalWords
|
43
30
|
extend Group
|
44
31
|
self.type = :annotator
|
data/lib/treat/kernel.rb
ADDED
@@ -0,0 +1,120 @@
|
|
1
|
+
# Extends the core Kernel module to provide
|
2
|
+
# easy access to utility functions used across
|
3
|
+
# the library.
|
4
|
+
module Kernel
  require 'fileutils'
  require 'tempfile'

  # Pipe-separated list of the acronyms used in class names within
  # the program. These do not CamelCase; they CAMELCase. The string
  # is interpolated into regular expressions as an alternation.
  Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF', 'GOCR'].join('|')

  # A cache to optimize camel casing.
  @@cc_cache = {}

  # A cache to optimize un camel casing.
  @@ucc_cache = {}

  # Returns the second dash-separated component of RUBY_PLATFORM
  # (nil when RUBY_PLATFORM contains no dash).
  def platform
    RUBY_PLATFORM.split('-')[1]
  end

  # Runs a block of code with $VERBOSE set to nil and returns the
  # block's value. The previous $VERBOSE is restored even when the
  # block raises, so a failing block cannot leave warnings disabled.
  def silence_warnings(&block)
    warn_level = $VERBOSE
    $VERBOSE = nil
    block.call
  ensure
    $VERBOSE = warn_level
  end

  # Runs a block of code while blocking stdout.
  # Currently not implemented: the block is simply yielded to and
  # the +streams+ arguments are ignored.
  def silence_streams(*streams)
    yield
  end

  # Creates a temporary file in Treat.tmp whose name ends in
  # ".<ext>", optionally writes +value+ to it (via puts), and calls
  # the block with the file's path. Returns the block's value.
  #
  # The value is flushed before the block runs so that code opening
  # the path sees the content, and the file is closed and deleted
  # once the block returns or raises.
  def create_temp_file(ext, value = nil, &block)
    tmp = Tempfile.new(['', ".#{ext}"], Treat.tmp)
    begin
      if value
        tmp.puts(value)
        tmp.flush
      end
      block.call(tmp.path)
    ensure
      tmp.close!
    end
  end

  # Convert un_camel_case to CamelCase. Accepts anything that
  # responds to to_s; results are memoized per argument.
  def camel_case(o_phrase)
    cached = @@cc_cache[o_phrase]
    return cached if cached
    phrase = o_phrase.to_s.dup
    # Upcase an acronym only when it stands alone (not glued to other
    # lowercase letters), so that e.g. "buildable" is not turned into
    # "BuiLDAble" because it happens to contain "lda".
    phrase.gsub!(/(?<![a-z])(?:#{Acronyms.downcase})(?![a-z])/) do |acronym|
      acronym.upcase
    end
    phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
    phrase.gsub!('_', '')
    @@cc_cache[o_phrase] = phrase
  end
  alias :cc :camel_case

  # Convert CamelCase to un_camel_case. Accepts anything that
  # responds to to_s; results are memoized per argument.
  def un_camel_case(o_phrase)
    cached = @@ucc_cache[o_phrase]
    return cached if cached
    phrase = o_phrase.to_s.dup
    # Collapse acronyms to a single capitalized word first ("HTML" ->
    # "Html") so they yield one underscore-separated segment.
    phrase.gsub!(/#{Acronyms}/) { |a| a.downcase.capitalize }
    phrase.gsub!(/[A-Z]/) { |p| '_' + p.downcase }
    phrase = phrase[1..-1] if phrase.start_with?('_')
    @@ucc_cache[o_phrase] = phrase
  end
  alias :ucc :un_camel_case

  # Retrieve the Class from a Module::Class.
  def class_name(n)
    n.to_s.split('::').last
  end
  alias :cl :class_name

  # Search +list+ for elements at Levenshtein distance 1 from +name+.
  # Returns " Perhaps you meant ... ?" naming them, or an empty
  # string when there are none.
  def did_you_mean?(list, name)
    sugg = []
    list.each do |element|
      # A nil distance (nil element or name) is simply not a match.
      sugg << element if levenshtein(element, name) == 1
    end
    return '' if sugg.empty?
    return " Perhaps you meant '#{sugg[0]}' ?" if sugg.size == 1
    # Interpolate rather than concatenate so that non-String elements
    # are quoted the same way as in the single-suggestion case.
    sugg_quote = sugg[0..-2].map { |x| "'#{x}'" }
    " Perhaps you meant #{sugg_quote.join(', ')}," +
    " or '#{sugg[-1]}' ?"
  end
  alias :dym? :did_you_mean?

  # Return, as a symbol, the method label of the frame +n+ levels
  # above this method (n = 1 is the direct caller). Returns :"" when
  # that frame does not exist or carries no method label.
  def caller_method(n = 3)
    at = (caller(n) || []).first
    # Accept both the backtick-quoted label of older Rubies and the
    # single-quoted, owner-qualified label ("Foo#bar") of Ruby 3.4+.
    match = /^(.+?):(\d+)(?::in [`'](.*)')?/.match(at.to_s)
    label = match && match[3]
    label = label.sub(/[A-Z]\w*(?:::[A-Z]\w*)*[#.]/, '') if label
    :"#{label}"
  end
  alias :cm :caller_method

  # Return the Levenshtein distance between two strings, taking
  # into account the costs of insertion, deletion and substitution.
  # Returns nil if either argument is nil.
  # Used by did_you_mean?
  def levenshtein(first, other, ins = 1, del = 1, sub = 1)
    return nil if first.nil? || other.nil?
    # Row 0: distances against the empty prefix of +other+.
    previous = (0..first.length).map { |j| j * ins }
    (1..other.length).each do |i|
      current = [i * del]
      (1..first.length).each do |j|
        # Columns index +first+ and rows index +other+.
        cost = first[j - 1] == other[i - 1] ? 0 : sub
        current[j] = [
          previous[j - 1] + cost,
          current[j - 1] + ins,
          previous[j] + del
        ].min
      end
      previous = current
    end
    previous[first.length]
  end
end
|
@@ -0,0 +1,23 @@
|
|
1
|
+
module Treat
  module Languages
    class English
      # Every word category recognised for English.
      Categories = [
        :adjective, :adverb, :noun, :verb, :interjection,
        :clitic, :coverb, :conjunction, :determiner, :particle,
        :preposition, :pronoun, :number, :symbol, :punctuation,
        :complementizer
      ]

      # A hash converting word tags to word categories: each tag maps
      # to a hash of { tag set => category }. AlignedWordTags is read
      # two elements at a time as (description, tags); the category is
      # the first word of the description, and the first three tags
      # are filed under :claws_5, :brown and :penn respectively.
      # NOTE(review): assumes each description is a String and each
      # tags entry is indexable with at least three elements - confirm
      # against AlignedWordTags.
      WordTagToCategory =
        Treat::Languages::English::AlignedWordTags.each_slice(2).
          each_with_object({}) do |(desc, tags), table|
            category = desc.gsub(',', ' ,').split(' ').first.downcase.to_sym
            [:claws_5, :brown, :penn].each_with_index do |tag_set, index|
              (table[tags[index]] ||= {})[tag_set] = category
            end
          end
    end
  end
end
|