treat 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +4 -4
- data/TODO +21 -54
- data/lib/economist/half_cocked_basel.txt +16 -0
- data/lib/economist/hose_and_dry.doc +0 -0
- data/lib/economist/hungarys_troubles.abw +70 -0
- data/lib/economist/republican_nomination.pdf +0 -0
- data/lib/economist/saving_the_euro.odt +0 -0
- data/lib/economist/to_infinity_and_beyond.txt +15 -0
- data/lib/economist/zero_sum.html +91 -0
- data/lib/treat.rb +58 -72
- data/lib/treat/buildable.rb +59 -15
- data/lib/treat/categories.rb +26 -14
- data/lib/treat/category.rb +2 -2
- data/lib/treat/delegatable.rb +65 -48
- data/lib/treat/doable.rb +44 -0
- data/lib/treat/entities.rb +34 -14
- data/lib/treat/entities/collection.rb +2 -0
- data/lib/treat/entities/document.rb +3 -2
- data/lib/treat/entities/entity.rb +105 -90
- data/lib/treat/entities/phrases.rb +17 -0
- data/lib/treat/entities/tokens.rb +28 -13
- data/lib/treat/entities/zones.rb +20 -0
- data/lib/treat/extractors.rb +49 -11
- data/lib/treat/extractors/coreferences/stanford.rb +68 -0
- data/lib/treat/extractors/date/chronic.rb +32 -0
- data/lib/treat/extractors/date/ruby.rb +25 -0
- data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
- data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
- data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
- data/lib/treat/extractors/language/what_language.rb +49 -0
- data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
- data/lib/treat/extractors/roles/naive.rb +73 -0
- data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
- data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
- data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
- data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
- data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
- data/lib/treat/extractors/time/nickel.rb +30 -12
- data/lib/treat/extractors/topic_words/lda.rb +9 -9
- data/lib/treat/extractors/topics/reuters.rb +14 -15
- data/lib/treat/extractors/topics/reuters/region.xml +1 -0
- data/lib/treat/features.rb +7 -0
- data/lib/treat/formatters/readers/abw.rb +6 -1
- data/lib/treat/formatters/readers/autoselect.rb +5 -6
- data/lib/treat/formatters/readers/doc.rb +3 -1
- data/lib/treat/formatters/readers/html.rb +1 -1
- data/lib/treat/formatters/readers/image.rb +43 -0
- data/lib/treat/formatters/readers/odt.rb +1 -2
- data/lib/treat/formatters/readers/pdf.rb +9 -1
- data/lib/treat/formatters/readers/xml.rb +40 -0
- data/lib/treat/formatters/serializers/xml.rb +50 -14
- data/lib/treat/formatters/serializers/yaml.rb +7 -2
- data/lib/treat/formatters/unserializers/xml.rb +33 -7
- data/lib/treat/formatters/visualizers/dot.rb +90 -20
- data/lib/treat/formatters/visualizers/short_value.rb +2 -2
- data/lib/treat/formatters/visualizers/standoff.rb +2 -2
- data/lib/treat/formatters/visualizers/tree.rb +1 -1
- data/lib/treat/formatters/visualizers/txt.rb +13 -4
- data/lib/treat/group.rb +16 -10
- data/lib/treat/helpers/linguistics_loader.rb +18 -0
- data/lib/treat/inflectors.rb +10 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
- data/lib/treat/inflectors/declensions/english.rb +319 -0
- data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
- data/lib/treat/install.rb +59 -0
- data/lib/treat/kernel.rb +18 -8
- data/lib/treat/languages.rb +18 -11
- data/lib/treat/languages/arabic.rb +4 -2
- data/lib/treat/languages/chinese.rb +6 -2
- data/lib/treat/languages/dutch.rb +16 -0
- data/lib/treat/languages/english.rb +47 -19
- data/lib/treat/languages/french.rb +8 -5
- data/lib/treat/languages/german.rb +9 -6
- data/lib/treat/languages/greek.rb +16 -0
- data/lib/treat/languages/italian.rb +6 -3
- data/lib/treat/languages/polish.rb +16 -0
- data/lib/treat/languages/portuguese.rb +16 -0
- data/lib/treat/languages/russian.rb +16 -0
- data/lib/treat/languages/spanish.rb +16 -0
- data/lib/treat/languages/swedish.rb +16 -0
- data/lib/treat/languages/tags.rb +377 -0
- data/lib/treat/lexicalizers.rb +34 -23
- data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
- data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
- data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
- data/lib/treat/lexicalizers/tag/brill.rb +35 -40
- data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
- data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
- data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
- data/lib/treat/processors.rb +8 -8
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +114 -99
- data/lib/treat/processors/parsers/stanford.rb +109 -41
- data/lib/treat/processors/segmenters/punkt.rb +17 -18
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
- data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
- data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
- data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
- data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
- data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
- data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
- data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
- data/lib/treat/processors/segmenters/stanford.rb +38 -37
- data/lib/treat/processors/segmenters/tactful.rb +5 -4
- data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
- data/lib/treat/processors/tokenizers/perl.rb +2 -2
- data/lib/treat/processors/tokenizers/punkt.rb +6 -2
- data/lib/treat/processors/tokenizers/stanford.rb +25 -24
- data/lib/treat/processors/tokenizers/tactful.rb +1 -2
- data/lib/treat/proxies.rb +2 -35
- data/lib/treat/registrable.rb +17 -22
- data/lib/treat/sugar.rb +11 -11
- data/lib/treat/tree.rb +27 -17
- data/lib/treat/viewable.rb +29 -0
- data/lib/treat/visitable.rb +1 -1
- data/test/tc_entity.rb +56 -49
- data/test/tc_extractors.rb +41 -18
- data/test/tc_formatters.rb +7 -8
- data/test/tc_inflectors.rb +19 -24
- data/test/tc_lexicalizers.rb +12 -19
- data/test/tc_processors.rb +26 -12
- data/test/tc_resources.rb +2 -7
- data/test/tc_treat.rb +20 -22
- data/test/tc_tree.rb +4 -4
- data/test/tests.rb +3 -5
- data/test/texts.rb +13 -14
- data/tmp/INFO +1 -0
- metadata +78 -158
- data/bin/INFO +0 -1
- data/examples/benchmark.rb +0 -81
- data/examples/keywords.rb +0 -148
- data/lib/treat/detectors.rb +0 -31
- data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
- data/lib/treat/detectors/format/file.rb +0 -36
- data/lib/treat/detectors/language/what_language.rb +0 -29
- data/lib/treat/entities/constituents.rb +0 -15
- data/lib/treat/entities/sentence.rb +0 -8
- data/lib/treat/extractors/named_entity/abner.rb +0 -20
- data/lib/treat/extractors/named_entity/stanford.rb +0 -174
- data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
- data/lib/treat/extractors/time/chronic.rb +0 -20
- data/lib/treat/extractors/time/native.rb +0 -18
- data/lib/treat/formatters/readers/gocr.rb +0 -26
- data/lib/treat/formatters/readers/ocropus.rb +0 -31
- data/lib/treat/formatters/visualizers/html.rb +0 -13
- data/lib/treat/formatters/visualizers/inspect.rb +0 -20
- data/lib/treat/inflectors/declensions/en.rb +0 -18
- data/lib/treat/languages/categories.rb +0 -5
- data/lib/treat/languages/english/categories.rb +0 -23
- data/lib/treat/languages/english/tags.rb +0 -352
- data/lib/treat/languages/xinhua.rb +0 -12
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
- data/lib/treat/string.rb +0 -5
- data/test/tc_detectors.rb +0 -26
@@ -7,31 +7,32 @@ module Treat
|
|
7
7
|
#
|
8
8
|
# Project website: http://deveiate.org/projects/Linguistics/
|
9
9
|
class Linguistics
|
10
|
-
|
11
|
-
silence_warnings { require 'linguistics' }
|
10
|
+
require 'treat/helpers/linguistics_loader'
|
12
11
|
# Retrieve a declension of a word using the 'linguistics' gem.
|
13
12
|
#
|
14
13
|
# Options:
|
15
14
|
#
|
16
15
|
# - (Identifier) :count => :singular, :plural
|
17
16
|
def self.declensions(entity, options = {})
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
silence_warnings { delegate = ::Linguistics.const_get(l) }
|
22
|
-
rescue RuntimeError
|
23
|
-
raise "Ruby Linguistics does not have a module " +
|
24
|
-
" installed for the #{entity.language} language."
|
17
|
+
unless options[:count]
|
18
|
+
raise Treat::Exception,
|
19
|
+
"Must supply option count (:singular or :plural)."
|
25
20
|
end
|
21
|
+
klass = Treat::Helpers::LinguisticsLoader.load(entity.language)
|
26
22
|
string = entity.to_s
|
23
|
+
if entity.category == :verb
|
24
|
+
raise Treat::Exception,
|
25
|
+
"Cannot retrieve the declensions of a verb. " +
|
26
|
+
"Use #singular_verb and #plural_verb instead."
|
27
|
+
end
|
27
28
|
if options[:count] == :plural
|
28
29
|
if entity.has?(:category) &&
|
29
30
|
[:noun, :adjective, :verb].include?(entity.category)
|
30
31
|
silence_warnings do
|
31
|
-
|
32
|
+
klass.send(:"plural_#{entity.category}", string)
|
32
33
|
end
|
33
34
|
else
|
34
|
-
silence_warnings {
|
35
|
+
silence_warnings { klass.plural(string) }
|
35
36
|
end
|
36
37
|
end
|
37
38
|
end
|
@@ -7,12 +7,12 @@ module Treat
|
|
7
7
|
#
|
8
8
|
# Project website: http://deveiate.org/projects/Linguistics/
|
9
9
|
class Linguistics
|
10
|
-
|
11
|
-
silence_warnings { require 'linguistics' }
|
10
|
+
require 'treat/helpers/linguistics_loader'
|
12
11
|
# Describe a number in words in ordinal form, using the
|
13
12
|
# 'linguistics' gem.
|
14
13
|
def self.ordinal_words(number, options = {})
|
15
|
-
|
14
|
+
klass = Treat::Helpers::LinguisticsLoader.load(number.language)
|
15
|
+
klass.ordinate(number.to_s)
|
16
16
|
end
|
17
17
|
end
|
18
18
|
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Treat
|
2
|
+
class Installer
|
3
|
+
require 'rubygems/dependency_installer'
|
4
|
+
# Install required dependencies and optional dependencies
|
5
|
+
# for a specific language.
|
6
|
+
def self.install(language = :english)
|
7
|
+
|
8
|
+
lang = Treat::Languages.get(language)
|
9
|
+
required = lang::RequiredDependencies
|
10
|
+
optional = lang::OptionalDependencies
|
11
|
+
|
12
|
+
puts "Treat Installer\n\n"
|
13
|
+
puts "Installing dependencies for the #{language.to_s.capitalize} language.\n\n"
|
14
|
+
|
15
|
+
flag = false
|
16
|
+
|
17
|
+
inst = Gem::DependencyInstaller.new
|
18
|
+
|
19
|
+
required.each do |dependency|
|
20
|
+
puts "Installing required dependency '#{dependency}'..."
|
21
|
+
begin
|
22
|
+
silence_warnings { inst.install(dependency) }
|
23
|
+
rescue
|
24
|
+
flag = true
|
25
|
+
puts "Couldn't install '#{dependency}'. " +
|
26
|
+
"You need install this dependency manually by running: " +
|
27
|
+
"'gem install #{dependency}' or use 'sudo' to run this script."
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
optional.each do |dependency|
|
32
|
+
begin
|
33
|
+
puts "Install optional dependency '#{dependency}' (yes/no, <enter> = skip) ?"
|
34
|
+
answer = gets.strip
|
35
|
+
raise Treat::Exception unless ['yes', 'no', ''].include?(answer)
|
36
|
+
if answer == 'yes'
|
37
|
+
silence_warnings { inst.install(dependency) }
|
38
|
+
else
|
39
|
+
puts "Skipped installing '#{dependency}'."
|
40
|
+
next
|
41
|
+
end
|
42
|
+
rescue Treat::Exception
|
43
|
+
puts "Invalid input - valid options are 'yes' or 'no'."
|
44
|
+
retry
|
45
|
+
rescue
|
46
|
+
flag = true
|
47
|
+
puts "Couldn't install '#{dependency}'. " +
|
48
|
+
"You can install this dependency manually by running: " +
|
49
|
+
"'gem install #{dependency}' or use 'sudo' to run this script."
|
50
|
+
end
|
51
|
+
end
|
52
|
+
|
53
|
+
w = flag ? 'incompletely' : 'normally'
|
54
|
+
puts "\nInstall proceeded #{w}."
|
55
|
+
puts
|
56
|
+
|
57
|
+
end
|
58
|
+
end
|
59
|
+
end
|
data/lib/treat/kernel.rb
CHANGED
@@ -7,7 +7,7 @@ module Kernel
|
|
7
7
|
# A list of acronyms used in class names within
|
8
8
|
# the program. These do not CamelCase; they
|
9
9
|
# CAMELCase.
|
10
|
-
Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF'
|
10
|
+
Acronyms = ['XML', 'HTML', 'YAML', 'UEA', 'LDA', 'PDF'].join('|')
|
11
11
|
# A cache to optimize camel casing.
|
12
12
|
@@cc_cache = {}
|
13
13
|
# A cache to optimize un camel casing.
|
@@ -24,15 +24,17 @@ module Kernel
|
|
24
24
|
$VERBOSE = warn_level
|
25
25
|
result
|
26
26
|
end
|
27
|
-
# Runs a block of code while blocking
|
28
|
-
|
29
|
-
|
27
|
+
# Runs a block of code while blocking stdout.
|
28
|
+
def silence_stdout(log = '/dev/null')
|
29
|
+
old = $stdout.dup
|
30
|
+
$stdout.reopen(File.new(log, 'w'))
|
30
31
|
yield
|
32
|
+
$stdout = old
|
31
33
|
end
|
32
34
|
# Create a temporary file which is deleted
|
33
35
|
# after execution of the block.
|
34
36
|
def create_temp_file(ext, value = nil, &block)
|
35
|
-
fname = "
|
37
|
+
fname = "#{Treat.lib}/../tmp/#{Random.rand(10000000).to_s}.#{ext}"
|
36
38
|
File.open(fname, 'w') do |f|
|
37
39
|
f.write(value) if value
|
38
40
|
block.call(f.path)
|
@@ -40,11 +42,19 @@ module Kernel
|
|
40
42
|
ensure
|
41
43
|
File.delete(fname)
|
42
44
|
end
|
45
|
+
# Create a temporary directory.
|
46
|
+
def create_temp_dir(&block)
|
47
|
+
dname = "#{Treat.lib}/../tmp/#{Random.rand(10000000).to_s}"
|
48
|
+
Dir.mkdir(dname)
|
49
|
+
block.call(dname)
|
50
|
+
ensure
|
51
|
+
FileUtils.rm_rf(dname)
|
52
|
+
end
|
43
53
|
# Convert un_camel_case to CamelCase.
|
44
54
|
def camel_case(o_phrase)
|
45
55
|
phrase = o_phrase.to_s.dup
|
46
56
|
return @@cc_cache[o_phrase] if @@cc_cache[o_phrase]
|
47
|
-
phrase.gsub!(/#{Acronyms.downcase}[^a-z]
|
57
|
+
phrase.gsub!(/#{Acronyms.downcase}[^a-z]*/) { |a| a.upcase }
|
48
58
|
phrase.gsub!(/^[a-z]|_[a-z]/) { |a| a.upcase }
|
49
59
|
phrase.gsub!('_', '')
|
50
60
|
@@cc_cache[o_phrase] = phrase
|
@@ -77,7 +87,7 @@ module Kernel
|
|
77
87
|
sugg << element
|
78
88
|
end
|
79
89
|
end
|
80
|
-
unless sugg.
|
90
|
+
unless sugg.size == 0
|
81
91
|
if sugg.size == 1
|
82
92
|
msg += " Perhaps you meant '#{sugg[0]}' ?"
|
83
93
|
else
|
@@ -94,7 +104,7 @@ module Kernel
|
|
94
104
|
def caller_method(n = 3)
|
95
105
|
at = caller(n).first
|
96
106
|
/^(.+?):(\d+)(?::in `(.*)')?/ =~ at
|
97
|
-
|
107
|
+
Regexp.last_match[3].intern
|
98
108
|
end
|
99
109
|
alias :cm :caller_method
|
100
110
|
# Return the Levenshtein distance between two strings
|
data/lib/treat/languages.rb
CHANGED
@@ -12,7 +12,7 @@ module Treat
|
|
12
12
|
# or its full text description in full French or English.
|
13
13
|
def self.describe(lang, desc_lang = :en)
|
14
14
|
raise "Must provide a non-nil language identifier to describe." if lang.nil?
|
15
|
-
lang =
|
15
|
+
lang = code(lang).to_s
|
16
16
|
if [:en, :eng, :english, :anglais].include?(desc_lang)
|
17
17
|
l = @@english_full.key(lang)
|
18
18
|
elsif [:fr, :fra, :french, :french].include?(desc_lang)
|
@@ -42,22 +42,22 @@ module Treat
|
|
42
42
|
# or full name (in English or French) and return
|
43
43
|
# the ISO-639-1 or ISO-639-2 language code as a
|
44
44
|
# lowercase identifier.
|
45
|
-
def self.
|
45
|
+
def self.code(lang, rc = ISO639_2)
|
46
46
|
raise "Must provide a non-nil language identifier to describe." if lang.nil?
|
47
47
|
get_languages
|
48
48
|
lang = lang.to_s.downcase
|
49
49
|
if @@iso639_1.has_key?(lang)
|
50
|
-
return
|
51
|
-
return
|
50
|
+
return lang.intern if rc == ISO639_1
|
51
|
+
return @@iso639_1[lang].intern if rc == ISO639_2
|
52
52
|
elsif @@iso639_2.has_key?(lang)
|
53
|
-
return
|
54
|
-
return
|
53
|
+
return lang.intern if rc == ISO639_2
|
54
|
+
return @@iso639_2[lang].intern if rc == ISO639_1
|
55
55
|
elsif @@english_full.has_key?(lang)
|
56
|
-
return
|
57
|
-
return
|
56
|
+
return @@english_full[lang].intern if rc == ISO639_2
|
57
|
+
return @@iso639_2[@@english_full[lang]].intern if rc == ISO639_1
|
58
58
|
elsif @@french_full.has_key?(lang)
|
59
|
-
return
|
60
|
-
return
|
59
|
+
return @@french_full[lang].intern if rc == ISO639_2
|
60
|
+
return @@iso639_1[@@french_full[lang]].intern if rc == ISO639_2
|
61
61
|
else
|
62
62
|
not_found(lang)
|
63
63
|
end
|
@@ -87,5 +87,12 @@ module Treat
|
|
87
87
|
end
|
88
88
|
@@loaded = true
|
89
89
|
end
|
90
|
+
# A list of all possible word categories.
|
91
|
+
WordCategories = [
|
92
|
+
:adjective, :adverb, :noun, :verb, :interjection,
|
93
|
+
:clitic, :coverb, :conjunction, :determiner, :particle,
|
94
|
+
:preposition, :pronoun, :number, :symbol, :punctuation,
|
95
|
+
:complementizer
|
96
|
+
]
|
90
97
|
end
|
91
|
-
end
|
98
|
+
end
|
@@ -1,13 +1,15 @@
|
|
1
1
|
module Treat
|
2
2
|
module Languages
|
3
3
|
class Arabic
|
4
|
+
RequiredDependencies = []
|
5
|
+
OptionalDependencies = []
|
4
6
|
Extractors = {}
|
5
7
|
Inflectors = {}
|
6
8
|
Lexicalizers = {
|
7
|
-
tag
|
9
|
+
:tag => [:stanford]
|
8
10
|
}
|
9
11
|
Processors = {
|
10
|
-
parsers
|
12
|
+
:parsers => [:stanford]
|
11
13
|
}
|
12
14
|
end
|
13
15
|
end
|
@@ -1,12 +1,16 @@
|
|
1
1
|
module Treat
|
2
2
|
module Languages
|
3
3
|
class Chinese
|
4
|
+
RequiredDependencies = []
|
5
|
+
OptionalDependencies = []
|
4
6
|
Extractors = {}
|
5
7
|
Inflectors = {}
|
6
8
|
Lexicalizers = {
|
7
|
-
tag
|
9
|
+
:tag => [:stanford]
|
10
|
+
}
|
11
|
+
Processors = {
|
12
|
+
:parsers => [:stanford]
|
8
13
|
}
|
9
|
-
Processors = {}
|
10
14
|
end
|
11
15
|
end
|
12
16
|
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Treat
|
2
|
+
module Languages
|
3
|
+
class Dutch
|
4
|
+
RequiredDependencies = []
|
5
|
+
OptionalDependencies = []
|
6
|
+
Processors = {
|
7
|
+
:chunkers => [:txt],
|
8
|
+
:segmenters => [:punkt],
|
9
|
+
:tokenizers => [:tactful]
|
10
|
+
}
|
11
|
+
Extractors = {}
|
12
|
+
Inflectors = {}
|
13
|
+
Lexicalizers = {}
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|
@@ -2,34 +2,62 @@ module Treat
|
|
2
2
|
module Languages
|
3
3
|
class English
|
4
4
|
|
5
|
-
|
6
|
-
|
5
|
+
RequiredDependencies = ['rbtagger', 'ruby-stemmer', 'tactful_tokenizer', 'nickel', 'wordnet']
|
6
|
+
OptionalDependencies = ['uea-stemmer', 'tokenizer', 'engtagger']
|
7
7
|
|
8
8
|
Extractors = {
|
9
|
-
time
|
10
|
-
|
11
|
-
|
12
|
-
|
9
|
+
:time => [:nickel],
|
10
|
+
:date => [:chronic, :ruby],
|
11
|
+
:topics => [:reuters],
|
12
|
+
:topic_words => [:lda],
|
13
|
+
:keywords => [:tf_idf, :topics_tf_idf],
|
14
|
+
:named_entity_tag => [:stanford],
|
15
|
+
:coreferences => [:stanford],
|
16
|
+
:roles => [:naive]
|
13
17
|
}
|
18
|
+
|
14
19
|
Processors = {
|
15
|
-
chunkers
|
16
|
-
parsers
|
17
|
-
segmenters
|
18
|
-
tokenizers
|
20
|
+
:chunkers => [:txt],
|
21
|
+
:parsers => [:stanford, :enju],
|
22
|
+
:segmenters => [:tactful, :punkt, :stanford],
|
23
|
+
:tokenizers => [:macintyre, :multilingual, :perl, :punkt, :stanford, :tactful]
|
19
24
|
}
|
25
|
+
|
20
26
|
Lexicalizers = {
|
21
|
-
category
|
22
|
-
linkages
|
23
|
-
synsets
|
24
|
-
tag
|
27
|
+
:category => [:from_tag],
|
28
|
+
:linkages => [:naive],
|
29
|
+
:synsets => [:wordnet],
|
30
|
+
:tag => [:brill, :lingua, :stanford]
|
25
31
|
}
|
32
|
+
|
26
33
|
Inflectors = {
|
27
|
-
conjugations
|
28
|
-
declensions
|
29
|
-
stem
|
30
|
-
ordinal_words
|
31
|
-
cardinal_words
|
34
|
+
:conjugations => [:linguistics],
|
35
|
+
:declensions => [:english, :linguistics],
|
36
|
+
:stem => [:porter, :porter_c, :uea],
|
37
|
+
:ordinal_words => [:linguistics],
|
38
|
+
:cardinal_words => [:linguistics]
|
32
39
|
}
|
40
|
+
|
41
|
+
CommonWords = [
|
42
|
+
'the', 'of', 'and', 'a', 'to', 'in', 'is',
|
43
|
+
'you', 'that', 'it', 'he', 'was', 'for', 'on',
|
44
|
+
'are', 'as', 'with', 'his', 'they', 'I', 'at',
|
45
|
+
'be', 'this', 'have', 'from', 'or', 'one', 'had',
|
46
|
+
'by', 'word', 'but', 'not', 'what', 'all', 'were',
|
47
|
+
'we', 'when', 'your', 'can', 'said', 'there', 'use',
|
48
|
+
'an', 'each', 'which', 'she', 'do', 'how', 'their',
|
49
|
+
'if', 'will', 'up', 'other', 'about', 'out', 'many',
|
50
|
+
'then', 'them', 'these', 'so', 'some', 'her', 'would',
|
51
|
+
'make', 'like', 'him', 'into', 'time', 'has', 'look',
|
52
|
+
'two', 'more', 'write', 'go', 'see', 'number', 'no',
|
53
|
+
'way', 'could', 'people', 'my', 'than', 'first', 'been',
|
54
|
+
'call', 'who', 'its', 'now', 'find', 'long', 'down',
|
55
|
+
'day', 'did', 'get', 'come', 'made', 'may', 'part',
|
56
|
+
'say', 'also', 'new', 'much', 'should', 'still',
|
57
|
+
'such', 'before', 'after', 'other', 'then', 'over',
|
58
|
+
'under', 'therefore', 'nonetheless', 'thereafter',
|
59
|
+
'afterwards', 'here', 'huh', 'hah', "'nt", "'t", 'here'
|
60
|
+
]
|
33
61
|
|
34
62
|
end
|
35
63
|
end
|
@@ -1,16 +1,19 @@
|
|
1
1
|
module Treat
|
2
2
|
module Languages
|
3
3
|
class French
|
4
|
+
RequiredDependencies = []
|
5
|
+
OptionalDependencies = []
|
4
6
|
Extractors = {}
|
5
7
|
Inflectors = {}
|
6
8
|
Lexicalizers = {
|
7
|
-
tag
|
9
|
+
:tag => [:stanford],
|
10
|
+
:category => [:from_tag]
|
8
11
|
}
|
9
12
|
Processors = {
|
10
|
-
chunkers
|
11
|
-
parsers
|
12
|
-
segmenters
|
13
|
-
tokenizers
|
13
|
+
:chunkers => [:txt],
|
14
|
+
:parsers => [:stanford],
|
15
|
+
:segmenters => [:punkt],
|
16
|
+
:tokenizers => [:tactful]
|
14
17
|
}
|
15
18
|
end
|
16
19
|
end
|
@@ -1,17 +1,20 @@
|
|
1
1
|
module Treat
|
2
2
|
module Languages
|
3
3
|
class German
|
4
|
+
RequiredDependencies = []
|
5
|
+
OptionalDependencies = []
|
4
6
|
Extractors = {}
|
5
7
|
Inflectors = {}
|
6
8
|
Lexicalizers = {
|
7
|
-
tag
|
9
|
+
:tag => [:stanford],
|
10
|
+
:category => [:from_tag]
|
8
11
|
}
|
9
12
|
Processors = {
|
10
|
-
chunkers
|
11
|
-
parsers
|
12
|
-
segmenters
|
13
|
-
tokenizers
|
13
|
+
:chunkers => [:txt],
|
14
|
+
:parsers => [:stanford],
|
15
|
+
:segmenters => [:punkt],
|
16
|
+
:tokenizers => [:tactful]
|
14
17
|
}
|
15
18
|
end
|
16
19
|
end
|
17
|
-
end
|
20
|
+
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Treat
|
2
|
+
module Languages
|
3
|
+
class Greek
|
4
|
+
RequiredDependencies = []
|
5
|
+
OptionalDependencies = []
|
6
|
+
Processors = {
|
7
|
+
:chunkers => [:txt],
|
8
|
+
:segmenters => [:punkt],
|
9
|
+
:tokenizers => [:tactful]
|
10
|
+
}
|
11
|
+
Extractors = {}
|
12
|
+
Inflectors = {}
|
13
|
+
Lexicalizers = {}
|
14
|
+
end
|
15
|
+
end
|
16
|
+
end
|