treat 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +1 -0
- data/README +3 -0
- data/TODO +14 -26
- data/bin/INFO +1 -1
- data/lib/treat/buildable.rb +10 -11
- data/lib/treat/categories.rb +8 -6
- data/lib/treat/category.rb +7 -2
- data/lib/treat/delegatable.rb +64 -56
- data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
- data/lib/treat/detectors/language/language_detector.rb +2 -1
- data/lib/treat/detectors/language/what_language.rb +2 -2
- data/lib/treat/detectors.rb +3 -0
- data/lib/treat/entities/entity.rb +1 -1
- data/lib/treat/entities.rb +9 -10
- data/lib/treat/exception.rb +3 -1
- data/lib/treat/extractors/named_entity/abner.rb +1 -1
- data/lib/treat/extractors/named_entity/stanford.rb +2 -2
- data/lib/treat/extractors/time/chronic.rb +2 -2
- data/lib/treat/extractors/time/nickel.rb +2 -2
- data/lib/treat/extractors/topic_words/lda.rb +2 -2
- data/lib/treat/extractors.rb +12 -9
- data/lib/treat/feature.rb +6 -1
- data/lib/treat/formatters/cleaners/html.rb +1 -1
- data/lib/treat/formatters.rb +8 -8
- data/lib/treat/group.rb +11 -10
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
- data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
- data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
- data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
- data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
- data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
- data/lib/treat/inflectors.rb +8 -21
- data/lib/treat/kernel.rb +120 -0
- data/lib/treat/languages/arabic.rb +14 -0
- data/lib/treat/languages/categories.rb +5 -0
- data/lib/treat/languages/chinese.rb +12 -0
- data/lib/treat/languages/english/categories.rb +23 -0
- data/lib/treat/{resources → languages/english}/tags.rb +127 -184
- data/lib/treat/languages/english.rb +33 -0
- data/lib/treat/languages/french.rb +17 -0
- data/lib/treat/languages/german.rb +17 -0
- data/lib/treat/languages/italian.rb +14 -0
- data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
- data/lib/treat/languages/xinhua.rb +12 -0
- data/lib/treat/languages.rb +91 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
- data/lib/treat/lexicalizers/tag/brill.rb +2 -1
- data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
- data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
- data/lib/treat/lexicalizers.rb +1 -1
- data/lib/treat/object.rb +6 -0
- data/lib/treat/processors/parsers/enju.rb +3 -2
- data/lib/treat/processors/parsers/stanford.rb +15 -12
- data/lib/treat/processors/segmenters/punkt.rb +1 -1
- data/lib/treat/processors/segmenters/stanford.rb +7 -5
- data/lib/treat/processors/segmenters/tactful.rb +1 -1
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
- data/lib/treat/processors/tokenizers/stanford.rb +7 -5
- data/lib/treat/visitable.rb +2 -1
- data/lib/treat.rb +105 -54
- data/test/tc_entity.rb +5 -0
- data/test/tc_resources.rb +5 -5
- data/test/tc_treat.rb +1 -2
- data/test/tests.rb +2 -1
- metadata +63 -64
- data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
- data/lib/treat/resources/categories.rb +0 -18
- data/lib/treat/resources/delegates.rb +0 -96
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +0 -8
- data/lib/treat/resources/formats.rb +0 -23
- data/lib/treat/resources/languages.rb +0 -86
- data/lib/treat/resources.rb +0 -10
- data/lib/treat/utilities.rb +0 -127
data/INSTALL
CHANGED
@@ -0,0 +1 @@
|
|
1
|
+
See the Wiki: https://github.com/louismullie/treat/wiki/Installing-Treat
|
data/README
CHANGED
data/TODO
CHANGED
@@ -1,23 +1,29 @@
|
|
1
|
-
|
1
|
+
## Urgent
|
2
2
|
|
3
|
-
- Linkers
|
3
|
+
- Linkers
|
4
4
|
- Check taggers for context
|
5
5
|
- Stanford dependencies parse
|
6
|
-
- Enju
|
6
|
+
- Enju: test
|
7
7
|
- Ocropus => use better function
|
8
|
-
- Optimize magic methods...
|
8
|
+
- Optimize magic methods... is_token? type methods, phrase categories.
|
9
9
|
- Move statistics?
|
10
10
|
- Synset class move
|
11
11
|
- general procedure for options, check that user doesn't want to change options...
|
12
|
-
-
|
12
|
+
- Languages: dependencies vs. edges, PTB function tags
|
13
13
|
- Check for # Fix everywhere
|
14
14
|
- Check paths; parse bin paths
|
15
15
|
- Ferret, Natural Inputs
|
16
16
|
- Use consistently delegate
|
17
17
|
- Text becomes section
|
18
|
+
- Remove top level
|
19
|
+
- Loading multiple JARs
|
20
|
+
- Tokenized sentences are not parsed
|
21
|
+
- Documentation
|
22
|
+
- Remove feature
|
18
23
|
|
19
|
-
|
24
|
+
## Eventually
|
20
25
|
|
26
|
+
- English inflector
|
21
27
|
- RDF output
|
22
28
|
- Apache OpenNLP
|
23
29
|
- Ariel
|
@@ -44,24 +50,6 @@
|
|
44
50
|
- Probabilistic features: rchardet19, what_language
|
45
51
|
- Enju multithreading ?
|
46
52
|
- String type detector for other languages
|
47
|
-
|
48
53
|
- Automatic benchmark
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
- Spell Cheker Raspell
|
53
|
-
- Multithreading
|
54
|
-
|
55
|
-
=== Checklist before releasing
|
56
|
-
|
57
|
-
- Remove code from main page
|
58
|
-
- Remove lib path from tests and main page
|
59
|
-
- Remove docs
|
60
|
-
- gem19 build treat.gemspec
|
61
|
-
|
62
|
-
=== Performance
|
63
|
-
|
64
|
-
- Cache results that get computed often
|
65
|
-
- Use .size == 0 instead of .empty?
|
66
|
-
- Optimize method_missing using define_method, even dynamically
|
67
|
-
- Array include is slow
|
54
|
+
- Raspell spell checker
|
55
|
+
- Multithreading
|
data/bin/INFO
CHANGED
@@ -1 +1 @@
|
|
1
|
-
This is where
|
1
|
+
This is where Treat will look for the Stanford JAR files by default. You can change this to another directory by setting Treat.bin = '/path/to/your/folder/' at runtime.
|
data/lib/treat/buildable.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module Treat
|
2
|
+
# Represents an object that can be built
|
3
|
+
# from a folder of files, a specific file,
|
4
|
+
# a string or a numeric object. This class
|
5
|
+
# is pretty much self-explanatory.
|
2
6
|
module Buildable
|
3
|
-
|
4
7
|
def from_anything(file_or_value, id)
|
5
8
|
if File.readable?(file_or_value.to_s)
|
6
9
|
from_file(file_or_value)
|
@@ -9,11 +12,11 @@ module Treat
|
|
9
12
|
elsif file_or_value.is_a?(Numeric)
|
10
13
|
from_numeric(file_or_value)
|
11
14
|
else
|
12
|
-
raise
|
15
|
+
raise Treat::Exception,
|
16
|
+
"Unrecognizable input #{file_or_value}. "+
|
13
17
|
"Use filename, folder, text or a number."
|
14
18
|
end
|
15
19
|
end
|
16
|
-
|
17
20
|
def from_string(string)
|
18
21
|
if self == Treat::Entities::Document ||
|
19
22
|
self == Treat::Entities::Collection
|
@@ -37,7 +40,6 @@ module Treat
|
|
37
40
|
end
|
38
41
|
return Treat::Entities::Unknown.new(string)
|
39
42
|
end
|
40
|
-
|
41
43
|
def from_numeric(numeric)
|
42
44
|
unless self == Treat::Entities::Number
|
43
45
|
raise Treat::Exception,
|
@@ -46,7 +48,6 @@ module Treat
|
|
46
48
|
end
|
47
49
|
Treat::Entities::Number.new(numeric.to_s)
|
48
50
|
end
|
49
|
-
|
50
51
|
def from_folder(folder)
|
51
52
|
unless FileTest.directory?(folder)
|
52
53
|
raise Treat::Exception,
|
@@ -69,7 +70,6 @@ module Treat
|
|
69
70
|
end
|
70
71
|
c
|
71
72
|
end
|
72
|
-
|
73
73
|
def from_file(file)
|
74
74
|
unless File.readable?(file)
|
75
75
|
raise Treat::Exception,
|
@@ -79,7 +79,8 @@ module Treat
|
|
79
79
|
from_folder(file)
|
80
80
|
else
|
81
81
|
ext = file.split('.')[-1]
|
82
|
-
|
82
|
+
# Humanize the yaml extension.
|
83
|
+
ext = 'yaml' if ext == 'yml'
|
83
84
|
if Treat::Formatters::Unserializers.list.
|
84
85
|
include?(ext.downcase.intern)
|
85
86
|
from_serialized_file(file)
|
@@ -88,7 +89,6 @@ module Treat
|
|
88
89
|
end
|
89
90
|
end
|
90
91
|
end
|
91
|
-
|
92
92
|
def from_raw_file(file)
|
93
93
|
unless self == Treat::Entities::Document
|
94
94
|
raise Treat::Exception,
|
@@ -98,9 +98,9 @@ module Treat
|
|
98
98
|
d = Treat::Entities::Document.new(file)
|
99
99
|
d.read
|
100
100
|
end
|
101
|
-
|
102
101
|
def from_serialized_file(file)
|
103
|
-
unless [Treat::Entities::Document,
|
102
|
+
unless [Treat::Entities::Document,
|
103
|
+
Treat::Entities::Collection].include?(self)
|
104
104
|
raise Treat::Exception,
|
105
105
|
"Cannot create something else than a " +
|
106
106
|
"document from raw file '#{file}'."
|
@@ -110,6 +110,5 @@ module Treat
|
|
110
110
|
d.children[0].set_as_root!
|
111
111
|
d.children[0]
|
112
112
|
end
|
113
|
-
|
114
113
|
end
|
115
114
|
end
|
data/lib/treat/categories.rb
CHANGED
@@ -1,16 +1,18 @@
|
|
1
1
|
module Treat
|
2
|
+
# This module keeps track of all categories that
|
3
|
+
# exist and the methods they implement, and is
|
4
|
+
# responsible for including the categories.
|
2
5
|
module Categories
|
3
|
-
#
|
4
|
-
# setup autoload, delegators and provide a list
|
5
|
-
# of methods.
|
6
|
+
# A list of categories.
|
6
7
|
class << self; attr_accessor :list; end
|
7
8
|
self.list = []
|
8
|
-
# Boolean - does any of the categories
|
9
|
-
#
|
9
|
+
# Boolean - does any of the categories have
|
10
|
+
# a method that corresponds to sym?
|
10
11
|
def self.have_method?(sym); methods.include?(sym); end
|
12
|
+
# Cache the list of methods once it has been computed.
|
13
|
+
@@methods = []
|
11
14
|
# Provide a list of all methods implemented
|
12
15
|
# by all Treat categories.
|
13
|
-
@@methods = []
|
14
16
|
def self.methods
|
15
17
|
return @@methods unless @@methods.empty?
|
16
18
|
self.list.each do |ns|
|
data/lib/treat/category.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
module Treat
|
2
|
-
#
|
2
|
+
# Clusters together groups of algorithms that
|
3
|
+
# perform similar functions.
|
3
4
|
module Category
|
5
|
+
# Require the Group class.
|
4
6
|
require 'treat/group'
|
7
|
+
# Add delegators to the Entities based on the
|
8
|
+
# configuration for a given category.
|
5
9
|
def self.extended(category)
|
6
10
|
Treat::Categories.list << category
|
7
11
|
category.module_eval do
|
@@ -14,9 +18,10 @@ module Treat
|
|
14
18
|
end
|
15
19
|
end
|
16
20
|
end
|
21
|
+
# Provides a list of groups within this category.
|
17
22
|
def groups; self.constants; end
|
18
23
|
# Provide a list of methods implemented in
|
19
|
-
# the groups contained within
|
24
|
+
# the groups contained within this category.
|
20
25
|
def methods
|
21
26
|
methods = []
|
22
27
|
groups.each do |group|
|
data/lib/treat/delegatable.rb
CHANGED
@@ -2,34 +2,8 @@ module Treat
|
|
2
2
|
# Makes a class delegatable, allowing calls on it to be forwarded
|
3
3
|
# to a delegate class performing the appropriate call.
|
4
4
|
module Delegatable
|
5
|
-
|
6
|
-
# Get the default delegate for that language
|
7
|
-
# inside the given group.
|
8
|
-
def get_language_delegate(language, group)
|
9
|
-
lang = Treat::Resources::Languages.describe(language)
|
10
|
-
lclass = cc(lang).intern
|
11
|
-
if Treat::Resources::Delegates.constants.include?(lclass)
|
12
|
-
cat = group.to_s.split('::')[-2].intern
|
13
|
-
lclass = Treat::Resources::Delegates.
|
14
|
-
const_get(lclass).const_get(cat)
|
15
|
-
g = ucc(cl(group)).intern
|
16
|
-
if !lclass[g] || !lclass[g][0]
|
17
|
-
d = ucc(cl(group))
|
18
|
-
d.gsub!('_', ' ')
|
19
|
-
d = d[0..-2] if d[-1] == 's'
|
20
|
-
d = 'delegator to find ' + d
|
21
|
-
raise Treat::Exception, "No #{d}" +
|
22
|
-
" is available for the #{lang} language."
|
23
|
-
end
|
24
|
-
return lclass[g][0]
|
25
|
-
else
|
26
|
-
raise Treat::Exception,
|
27
|
-
"Language '#{lang}' is not supported (yet)."
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
5
|
# Add decorator methods to entities.
|
32
|
-
def
|
6
|
+
def add_decorators(group, m)
|
33
7
|
decorators = group.methods -
|
34
8
|
Object.methods -
|
35
9
|
[:type, :type=, :targets, :targets=,
|
@@ -42,49 +16,83 @@ module Treat
|
|
42
16
|
end
|
43
17
|
end
|
44
18
|
end
|
45
|
-
|
46
|
-
# Raise an exception and suggest alternatives.
|
47
|
-
def delegate_not_found(klass, group)
|
48
|
-
"Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
|
49
|
-
did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
|
50
|
-
end
|
51
|
-
|
52
19
|
# Add delegator group to all entities of a class.
|
53
20
|
def add_delegators(group)
|
54
21
|
# Define each method in group.
|
55
22
|
self.class_eval do
|
56
23
|
m = group.method
|
57
|
-
|
24
|
+
add_decorators(group, m)
|
58
25
|
define_method(m) do |delegate=nil, options={}|
|
59
26
|
decorator = options.delete(:decorator)
|
60
27
|
puts self.id if !@features
|
61
28
|
if !@features[m].nil?
|
62
29
|
@features[m]
|
63
30
|
else
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
group.default
|
68
|
-
raise "No default delegate for #{group}." if delegate == :none
|
69
|
-
end
|
70
|
-
if not group.list.include?(delegate)
|
71
|
-
raise Treat::Exception,
|
72
|
-
self.class.delegate_not_found(delegate, group)
|
73
|
-
else
|
74
|
-
delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
|
75
|
-
result = accept(group, delegate_klass, m, options)
|
76
|
-
if decorator
|
77
|
-
result = group.send(decorator, self, result)
|
78
|
-
end
|
79
|
-
if group.type == :annotator
|
80
|
-
f = decorator.nil? ? m : decorator
|
81
|
-
@features[f] = result
|
82
|
-
end
|
83
|
-
result
|
84
|
-
end
|
31
|
+
self.class.call_delegator(
|
32
|
+
self, m, delegate, decorator,
|
33
|
+
group, options)
|
85
34
|
end
|
86
35
|
end
|
87
36
|
end
|
88
37
|
end
|
38
|
+
# Call a delegator.
|
39
|
+
def call_delegator(entity, m, delegate, decorator, group, options)
|
40
|
+
if delegate.nil?
|
41
|
+
delegate = get_missing_delegate(entity, group)
|
42
|
+
end
|
43
|
+
if not group.list.include?(delegate)
|
44
|
+
raise Treat::Exception, delegate_not_found(delegate, group)
|
45
|
+
else
|
46
|
+
delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
|
47
|
+
result = entity.accept(group, delegate_klass, m, options)
|
48
|
+
if decorator
|
49
|
+
result = group.send(decorator, self, result)
|
50
|
+
end
|
51
|
+
if group.type == :annotator
|
52
|
+
f = decorator.nil? ? m : decorator
|
53
|
+
entity.features[f] = result
|
54
|
+
end
|
55
|
+
result
|
56
|
+
end
|
57
|
+
end
|
58
|
+
# Get the default delegate for that language
|
59
|
+
# inside the given group.
|
60
|
+
def get_language_delegate(language, group)
|
61
|
+
lang = Treat::Languages.describe(language)
|
62
|
+
lclass = cc(lang).intern
|
63
|
+
if Treat::Languages.constants.include?(lclass)
|
64
|
+
cat = group.to_s.split('::')[-2].intern
|
65
|
+
lclass = Treat::Languages.const_get(lclass).const_get(cat)
|
66
|
+
g = ucc(cl(group)).intern
|
67
|
+
if !lclass[g] || !lclass[g][0]
|
68
|
+
d = ucc(cl(group))
|
69
|
+
d.gsub!('_', ' ')
|
70
|
+
d = d[0..-2] if d[-1] == 's'
|
71
|
+
d = 'delegator to find ' + d
|
72
|
+
raise Treat::Exception, "No #{d}" +
|
73
|
+
" is available for the #{lang} language."
|
74
|
+
end
|
75
|
+
return lclass[g][0]
|
76
|
+
else
|
77
|
+
raise Treat::Exception,
|
78
|
+
"Language '#{lang}' is not supported (yet)."
|
79
|
+
end
|
80
|
+
end
|
81
|
+
# Get which delegate to use if none has been supplied.
|
82
|
+
def get_missing_delegate(entity, group)
|
83
|
+
delegate = group.default.nil? ?
|
84
|
+
self.get_language_delegate(entity.language, group) :
|
85
|
+
group.default
|
86
|
+
if delegate == :none
|
87
|
+
raise NAT::Exception,
|
88
|
+
"There is intentionally no default delegate for #{group}."
|
89
|
+
end
|
90
|
+
delegate
|
91
|
+
end
|
92
|
+
# Return an error message and suggest possible typos.
|
93
|
+
def delegate_not_found(klass, group)
|
94
|
+
"Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
|
95
|
+
did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
|
96
|
+
end
|
89
97
|
end
|
90
98
|
end
|
@@ -9,7 +9,8 @@ module Treat
|
|
9
9
|
dlvl = Treat.language_detection_level
|
10
10
|
if (Entities.rank(entity.type) < Entities.rank(dlvl)) &&
|
11
11
|
entity.has_parent?
|
12
|
-
|
12
|
+
anc = entity.ancestor_with_type(dlvl)
|
13
|
+
return anc.language if anc
|
13
14
|
end
|
14
15
|
end
|
15
16
|
end
|
@@ -2,7 +2,7 @@ module Treat
|
|
2
2
|
module Detectors
|
3
3
|
module Language
|
4
4
|
# Require the 'whatlanguage' gem.
|
5
|
-
|
5
|
+
silence_warnings { require 'whatlanguage' }
|
6
6
|
# Adaptor for the 'whatlanguage' gem, which
|
7
7
|
# performs probabilistic language detection.
|
8
8
|
class WhatLanguage < LanguageDetector
|
@@ -19,7 +19,7 @@ module Treat
|
|
19
19
|
all = @@wl.process_text(entity.to_s)
|
20
20
|
lang = {}
|
21
21
|
all.each do |k,v|
|
22
|
-
lang[Treat::
|
22
|
+
lang[Treat::Languages.find(k)] = v
|
23
23
|
end
|
24
24
|
Treat::Feature.new(lang).best
|
25
25
|
end
|
data/lib/treat/detectors.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module Treat
|
2
2
|
# Detectors detect a specific meta-information about
|
3
3
|
# an entity, such as encoding, format and language.
|
4
|
+
#
|
5
|
+
# Detectors are language-independent, and thus they
|
6
|
+
# are default algorithms specified for each of them.
|
4
7
|
module Detectors
|
5
8
|
# Group for algorithms that detect encoding.
|
6
9
|
module Encoding
|
@@ -93,7 +93,7 @@ module Treat
|
|
93
93
|
# dispatches done by Ruby to improve performance.
|
94
94
|
def parse_magic_method(sym, *args, &block)
|
95
95
|
@@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
|
96
|
-
@@cats_regexp ||= "(#{Treat::
|
96
|
+
@@cats_regexp ||= "(#{Treat::Languages::English::Categories.join('|')})"
|
97
97
|
method = sym.to_s =~ /entities/ ?
|
98
98
|
sym.to_s.gsub('entities', 'entitys'):
|
99
99
|
method = sym.to_s
|
data/lib/treat/entities.rb
CHANGED
@@ -4,15 +4,14 @@ module Treat
|
|
4
4
|
#
|
5
5
|
# - Collection
|
6
6
|
# - Document
|
7
|
-
# - Text
|
8
7
|
# - Zone (a Section, Title, Paragraph, or List)
|
9
8
|
# - Sentence
|
10
9
|
# - Constituent (a Phrase or Clause)
|
11
10
|
# - Token (a Word, Number, Punctuation, or Symbol).
|
12
11
|
module Entities
|
13
|
-
# Require Entity first
|
14
|
-
# extend this class.
|
12
|
+
# Require Entity first.
|
15
13
|
require 'treat/entities/entity'
|
14
|
+
# Then require all possible entities.
|
16
15
|
require 'treat/entities/collection'
|
17
16
|
require 'treat/entities/document'
|
18
17
|
require 'treat/entities/text'
|
@@ -40,13 +39,13 @@ module Treat
|
|
40
39
|
# comparison of entity types.
|
41
40
|
def self.rank(type)
|
42
41
|
klass = Entities.const_get(cc(type))
|
43
|
-
|
44
|
-
return
|
45
|
-
return
|
46
|
-
return
|
47
|
-
return
|
48
|
-
return
|
49
|
-
return
|
42
|
+
compare = lambda { |a,b| a == b || a < b }
|
43
|
+
return 0 if compare.call(klass, Token)
|
44
|
+
return 1 if compare.call(klass, Constituent)
|
45
|
+
return 2 if compare.call(klass, Sentence)
|
46
|
+
return 4 if compare.call(klass, Document)
|
47
|
+
return 3 if compare.call(klass, Section)
|
48
|
+
return 5 if compare.call(klass, Collection)
|
50
49
|
end
|
51
50
|
end
|
52
51
|
end
|
data/lib/treat/exception.rb
CHANGED
@@ -3,7 +3,7 @@ module Treat
|
|
3
3
|
module NamedEntity
|
4
4
|
class Stanford
|
5
5
|
# Require the Ruby-Java bridge.
|
6
|
-
|
6
|
+
silence_warnings do
|
7
7
|
require 'rjb'
|
8
8
|
Rjb::load(nil, ['-Xms256M', '-Xmx1024M'])
|
9
9
|
Rjb::add_jar('/ruby/treat/bin/treat/treat.jar')
|
@@ -23,7 +23,7 @@ module Treat
|
|
23
23
|
properties.set_property('ner.model.3class', '/ruby/treat/bin/stanford/classifiers/all.3class.distsim.crf.ser.gz')
|
24
24
|
properties.set_property('ner.model.7class', '/ruby/treat/bin/stanford/classifiers/muc.7class.distsim.crf.ser.gz')
|
25
25
|
properties.set_property('ner.model.MISCclass', '/ruby/treat/bin/stanford/classifiers/conll.4class.distsim.crf.ser.gz')
|
26
|
-
properties.set_property('parser.model', '/ruby/treat/bin/
|
26
|
+
properties.set_property('parser.model', '/ruby/treat/bin/stanford-parser/grammar/englishPCFG.ser.gz')
|
27
27
|
silence_stream(STDOUT) do
|
28
28
|
pipeline = StanfordCoreNLP.new(properties)
|
29
29
|
end
|
@@ -2,9 +2,9 @@ module Treat
|
|
2
2
|
module Extractors
|
3
3
|
module Time
|
4
4
|
class Chronic
|
5
|
-
|
5
|
+
silence_warnings { require 'chronic' }
|
6
6
|
def self.time(entity, options = {})
|
7
|
-
|
7
|
+
silence_warnings { ::Chronic.parse(entity.to_s, {:guess => true}) }
|
8
8
|
end
|
9
9
|
end
|
10
10
|
end
|
@@ -15,9 +15,9 @@ module Treat
|
|
15
15
|
=end
|
16
16
|
module Nickel
|
17
17
|
require 'date'
|
18
|
-
|
18
|
+
silence_warnings { require 'nickel' }
|
19
19
|
def self.time(entity, options = {})
|
20
|
-
n =
|
20
|
+
n = silence_warnings { ::Nickel.parse(entity.to_s) }
|
21
21
|
occ = n.occurrences[0]
|
22
22
|
# Find the words..
|
23
23
|
rec = occ.type.to_s.gsub('single', 'once').intern
|
@@ -11,12 +11,12 @@ module Treat
|
|
11
11
|
# Machine Learning Research. 3 (Mar. 2003), 993-1022.
|
12
12
|
class LDA
|
13
13
|
# Require the lda-ruby gem.
|
14
|
-
|
14
|
+
silence_warnings { require 'lda-ruby' }
|
15
15
|
# Monkey patch the TextCorpus class to call it without
|
16
16
|
# having to create any files.
|
17
17
|
Lda::TextCorpus.class_eval do
|
18
18
|
# Ruby, Y U NO SHUT UP!
|
19
|
-
|
19
|
+
silence_warnings { undef :initialize }
|
20
20
|
# Redefine initialize to take in an array of texts.
|
21
21
|
def initialize(texts)
|
22
22
|
super(nil)
|
data/lib/treat/extractors.rb
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
module Treat
|
2
2
|
# Extractors extract specific information out of texts.
|
3
3
|
module Extractors
|
4
|
-
# Extracts
|
5
|
-
#
|
4
|
+
# Extracts the time of an object and annotates it
|
5
|
+
# with specific information regarding time.
|
6
6
|
module Time
|
7
7
|
extend Group
|
8
|
-
self.type = :
|
8
|
+
self.type = :annotator
|
9
9
|
self.targets = [:word, :constituent, :symbol]
|
10
10
|
end
|
11
11
|
# Extract the topic from a text.
|
@@ -20,22 +20,25 @@ module Treat
|
|
20
20
|
self.type = :annotator
|
21
21
|
self.targets = [:collection, :document, :text, :zone, :sentence]
|
22
22
|
end
|
23
|
-
|
24
|
-
extend Group
|
25
|
-
self.type = :computer
|
26
|
-
self.targets = [:entity]
|
27
|
-
self.default = :none
|
28
|
-
end
|
23
|
+
# Extract named entities from texts.
|
29
24
|
module NamedEntity
|
30
25
|
extend Group
|
31
26
|
self.type = :computer
|
32
27
|
self.targets = [:entity]
|
33
28
|
end
|
29
|
+
# Extract the key sentences from a text.
|
34
30
|
module KeySentences
|
35
31
|
extend Group
|
36
32
|
self.type = :computer
|
37
33
|
self.targets = [:collection, :document, :text, :zone, :sentence]
|
38
34
|
end
|
35
|
+
# This module should be moved out of here ASAP.
|
36
|
+
module Statistics
|
37
|
+
extend Group
|
38
|
+
self.type = :computer
|
39
|
+
self.targets = [:entity]
|
40
|
+
self.default = :none
|
41
|
+
end
|
39
42
|
extend Treat::Category
|
40
43
|
end
|
41
44
|
end
|
data/lib/treat/feature.rb
CHANGED
@@ -1,4 +1,9 @@
|
|
1
1
|
module Treat
|
2
|
+
# This class represents a probabilistic feature;
|
3
|
+
# it is currently not used, because its
|
4
|
+
# behaviour is non-deterministic. Perhaps at
|
5
|
+
# some point this will be of value for specific
|
6
|
+
# algorithms and so I'm keeping it here.
|
2
7
|
class Feature
|
3
8
|
# Undefine all methods, except those that
|
4
9
|
# create any problems (e.g. with serializing).
|
@@ -26,7 +31,7 @@ module Treat
|
|
26
31
|
end
|
27
32
|
end
|
28
33
|
# Normalize the probabilities, so that
|
29
|
-
# the sum of all probabilities is
|
34
|
+
# the sum of all probabilities is 1,
|
30
35
|
# except if the sum of all probabilities
|
31
36
|
# is already below one (in which case we
|
32
37
|
# assume that the feature is intentionally
|
data/lib/treat/formatters.rb
CHANGED
@@ -10,6 +10,13 @@ module Treat
|
|
10
10
|
self.targets = [:collection, :document]
|
11
11
|
self.default = :autoselect
|
12
12
|
end
|
13
|
+
# Serializers transform entities into a storable format.
|
14
|
+
module Serializers
|
15
|
+
extend Group
|
16
|
+
self.type = :computer
|
17
|
+
self.targets = [:entity]
|
18
|
+
self.default = :yaml
|
19
|
+
end
|
13
20
|
# Unserializers recreate entities from a serialized format.
|
14
21
|
module Unserializers
|
15
22
|
extend Group
|
@@ -24,14 +31,7 @@ module Treat
|
|
24
31
|
self.targets = [:entity]
|
25
32
|
self.default = :tree
|
26
33
|
end
|
27
|
-
#
|
28
|
-
module Serializers
|
29
|
-
extend Group
|
30
|
-
self.type = :computer
|
31
|
-
self.targets = [:entity]
|
32
|
-
self.default = :yaml
|
33
|
-
end
|
34
|
-
# Serializers transform entities into a storable format.
|
34
|
+
# Cleaners strip a text from its mark up.
|
35
35
|
module Cleaners
|
36
36
|
extend Group
|
37
37
|
self.type = :annotator
|