treat 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/INSTALL +1 -0
- data/README +3 -0
- data/TODO +14 -26
- data/bin/INFO +1 -1
- data/lib/treat/buildable.rb +10 -11
- data/lib/treat/categories.rb +8 -6
- data/lib/treat/category.rb +7 -2
- data/lib/treat/delegatable.rb +64 -56
- data/lib/treat/detectors/encoding/r_chardet19.rb +1 -1
- data/lib/treat/detectors/language/language_detector.rb +2 -1
- data/lib/treat/detectors/language/what_language.rb +2 -2
- data/lib/treat/detectors.rb +3 -0
- data/lib/treat/entities/entity.rb +1 -1
- data/lib/treat/entities.rb +9 -10
- data/lib/treat/exception.rb +3 -1
- data/lib/treat/extractors/named_entity/abner.rb +1 -1
- data/lib/treat/extractors/named_entity/stanford.rb +2 -2
- data/lib/treat/extractors/time/chronic.rb +2 -2
- data/lib/treat/extractors/time/nickel.rb +2 -2
- data/lib/treat/extractors/topic_words/lda.rb +2 -2
- data/lib/treat/extractors.rb +12 -9
- data/lib/treat/feature.rb +6 -1
- data/lib/treat/formatters/cleaners/html.rb +1 -1
- data/lib/treat/formatters.rb +8 -8
- data/lib/treat/group.rb +11 -10
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/{conjugators → conjugations}/linguistics.rb +6 -6
- data/lib/treat/inflectors/{declensors → declensions}/en.rb +2 -2
- data/lib/treat/inflectors/{declensors → declensions}/linguistics.rb +5 -5
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +4 -4
- data/lib/treat/inflectors/{stemmers → stem}/porter.rb +1 -1
- data/lib/treat/inflectors/{stemmers → stem}/porter_c.rb +3 -3
- data/lib/treat/inflectors/{stemmers → stem}/uea.rb +3 -3
- data/lib/treat/inflectors.rb +8 -21
- data/lib/treat/kernel.rb +120 -0
- data/lib/treat/languages/arabic.rb +14 -0
- data/lib/treat/languages/categories.rb +5 -0
- data/lib/treat/languages/chinese.rb +12 -0
- data/lib/treat/languages/english/categories.rb +23 -0
- data/lib/treat/{resources → languages/english}/tags.rb +127 -184
- data/lib/treat/languages/english.rb +33 -0
- data/lib/treat/languages/french.rb +17 -0
- data/lib/treat/languages/german.rb +17 -0
- data/lib/treat/languages/italian.rb +14 -0
- data/lib/treat/{resources/languages.txt → languages/list.txt} +0 -0
- data/lib/treat/languages/xinhua.rb +12 -0
- data/lib/treat/languages.rb +91 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +20 -8
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +1 -1
- data/lib/treat/lexicalizers/tag/brill.rb +2 -1
- data/lib/treat/lexicalizers/tag/lingua.rb +2 -1
- data/lib/treat/lexicalizers/tag/stanford.rb +16 -15
- data/lib/treat/lexicalizers.rb +1 -1
- data/lib/treat/object.rb +6 -0
- data/lib/treat/processors/parsers/enju.rb +3 -2
- data/lib/treat/processors/parsers/stanford.rb +15 -12
- data/lib/treat/processors/segmenters/punkt.rb +1 -1
- data/lib/treat/processors/segmenters/stanford.rb +7 -5
- data/lib/treat/processors/segmenters/tactful.rb +1 -1
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -2
- data/lib/treat/processors/tokenizers/stanford.rb +7 -5
- data/lib/treat/visitable.rb +2 -1
- data/lib/treat.rb +105 -54
- data/test/tc_entity.rb +5 -0
- data/test/tc_resources.rb +5 -5
- data/test/tc_treat.rb +1 -2
- data/test/tests.rb +2 -1
- metadata +63 -64
- data/lib/treat/formatters/serializers/yaml/helper.rb +0 -96
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +0 -213
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +0 -68
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +0 -6
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +0 -12
- data/lib/treat/resources/categories.rb +0 -18
- data/lib/treat/resources/delegates.rb +0 -96
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +0 -8
- data/lib/treat/resources/formats.rb +0 -23
- data/lib/treat/resources/languages.rb +0 -86
- data/lib/treat/resources.rb +0 -10
- data/lib/treat/utilities.rb +0 -127
data/INSTALL
CHANGED
@@ -0,0 +1 @@
|
|
1
|
+
See the Wiki: https://github.com/louismullie/treat/wiki/Installing-Treat
|
data/README
CHANGED
data/TODO
CHANGED
@@ -1,23 +1,29 @@
|
|
1
|
-
|
1
|
+
## Urgent
|
2
2
|
|
3
|
-
- Linkers
|
3
|
+
- Linkers
|
4
4
|
- Check taggers for context
|
5
5
|
- Stanford dependencies parse
|
6
|
-
- Enju
|
6
|
+
- Enju: test
|
7
7
|
- Ocropus => use better function
|
8
|
-
- Optimize magic methods...
|
8
|
+
- Optimize magic methods... is_token? type methods, phrase categories.
|
9
9
|
- Move statistics?
|
10
10
|
- Synset class move
|
11
11
|
- general procedure for options, check that user doesn't want to change options...
|
12
|
-
-
|
12
|
+
- Languages: dependencies vs. edges, PTB function tags
|
13
13
|
- Check for # Fix everywhere
|
14
14
|
- Check paths; parse bin paths
|
15
15
|
- Ferret, Natural Inputs
|
16
16
|
- Use consistently delegate
|
17
17
|
- Text becomes section
|
18
|
+
- Remove top level
|
19
|
+
- Loading multiple JARs
|
20
|
+
- Tokenized sentences are not parsed
|
21
|
+
- Documentation
|
22
|
+
- Remove feature
|
18
23
|
|
19
|
-
|
24
|
+
## Eventually
|
20
25
|
|
26
|
+
- English inflector
|
21
27
|
- RDF output
|
22
28
|
- Apache OpenNLP
|
23
29
|
- Ariel
|
@@ -44,24 +50,6 @@
|
|
44
50
|
- Probabilistic features: rchardet19, what_language
|
45
51
|
- Enju multithreading ?
|
46
52
|
- String type detector for other languages
|
47
|
-
|
48
53
|
- Automatic benchmark
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
- Spell Cheker Raspell
|
53
|
-
- Multithreading
|
54
|
-
|
55
|
-
=== Checklist before releasing
|
56
|
-
|
57
|
-
- Remove code from main page
|
58
|
-
- Remove lib path from tests and main page
|
59
|
-
- Remove docs
|
60
|
-
- gem19 build treat.gemspec
|
61
|
-
|
62
|
-
=== Performance
|
63
|
-
|
64
|
-
- Cache results that get computed often
|
65
|
-
- Use .size == 0 instead of .empty?
|
66
|
-
- Optimize method_missing using define_method, even dynamically
|
67
|
-
- Array include is slow
|
54
|
+
- Raspell spell checker
|
55
|
+
- Multithreading
|
data/bin/INFO
CHANGED
@@ -1 +1 @@
|
|
1
|
-
This is where
|
1
|
+
This is where Treat will look for the Stanford JAR files by default. You can change this to another directory by setting Treat.bin = '/path/to/your/folder/' at runtime.
|
data/lib/treat/buildable.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module Treat
|
2
|
+
# Represents an object that can be built
|
3
|
+
# from a folder of files, a specific file,
|
4
|
+
# a string or a numeric object. This class
|
5
|
+
# is pretty much self-explanatory.
|
2
6
|
module Buildable
|
3
|
-
|
4
7
|
def from_anything(file_or_value, id)
|
5
8
|
if File.readable?(file_or_value.to_s)
|
6
9
|
from_file(file_or_value)
|
@@ -9,11 +12,11 @@ module Treat
|
|
9
12
|
elsif file_or_value.is_a?(Numeric)
|
10
13
|
from_numeric(file_or_value)
|
11
14
|
else
|
12
|
-
raise
|
15
|
+
raise Treat::Exception,
|
16
|
+
"Unrecognizable input #{file_or_value}. "+
|
13
17
|
"Use filename, folder, text or a number."
|
14
18
|
end
|
15
19
|
end
|
16
|
-
|
17
20
|
def from_string(string)
|
18
21
|
if self == Treat::Entities::Document ||
|
19
22
|
self == Treat::Entities::Collection
|
@@ -37,7 +40,6 @@ module Treat
|
|
37
40
|
end
|
38
41
|
return Treat::Entities::Unknown.new(string)
|
39
42
|
end
|
40
|
-
|
41
43
|
def from_numeric(numeric)
|
42
44
|
unless self == Treat::Entities::Number
|
43
45
|
raise Treat::Exception,
|
@@ -46,7 +48,6 @@ module Treat
|
|
46
48
|
end
|
47
49
|
Treat::Entities::Number.new(numeric.to_s)
|
48
50
|
end
|
49
|
-
|
50
51
|
def from_folder(folder)
|
51
52
|
unless FileTest.directory?(folder)
|
52
53
|
raise Treat::Exception,
|
@@ -69,7 +70,6 @@ module Treat
|
|
69
70
|
end
|
70
71
|
c
|
71
72
|
end
|
72
|
-
|
73
73
|
def from_file(file)
|
74
74
|
unless File.readable?(file)
|
75
75
|
raise Treat::Exception,
|
@@ -79,7 +79,8 @@ module Treat
|
|
79
79
|
from_folder(file)
|
80
80
|
else
|
81
81
|
ext = file.split('.')[-1]
|
82
|
-
|
82
|
+
# Humanize the yaml extension.
|
83
|
+
ext = 'yaml' if ext == 'yml'
|
83
84
|
if Treat::Formatters::Unserializers.list.
|
84
85
|
include?(ext.downcase.intern)
|
85
86
|
from_serialized_file(file)
|
@@ -88,7 +89,6 @@ module Treat
|
|
88
89
|
end
|
89
90
|
end
|
90
91
|
end
|
91
|
-
|
92
92
|
def from_raw_file(file)
|
93
93
|
unless self == Treat::Entities::Document
|
94
94
|
raise Treat::Exception,
|
@@ -98,9 +98,9 @@ module Treat
|
|
98
98
|
d = Treat::Entities::Document.new(file)
|
99
99
|
d.read
|
100
100
|
end
|
101
|
-
|
102
101
|
def from_serialized_file(file)
|
103
|
-
unless [Treat::Entities::Document,
|
102
|
+
unless [Treat::Entities::Document,
|
103
|
+
Treat::Entities::Collection].include?(self)
|
104
104
|
raise Treat::Exception,
|
105
105
|
"Cannot create something else than a " +
|
106
106
|
"document from raw file '#{file}'."
|
@@ -110,6 +110,5 @@ module Treat
|
|
110
110
|
d.children[0].set_as_root!
|
111
111
|
d.children[0]
|
112
112
|
end
|
113
|
-
|
114
113
|
end
|
115
114
|
end
|
data/lib/treat/categories.rb
CHANGED
@@ -1,16 +1,18 @@
|
|
1
1
|
module Treat
|
2
|
+
# This module keeps track of all categories that
|
3
|
+
# exist and the methods they implement, and is
|
4
|
+
# responsible for including the categories.
|
2
5
|
module Categories
|
3
|
-
#
|
4
|
-
# setup autoload, delegators and provide a list
|
5
|
-
# of methods.
|
6
|
+
# A list of categories.
|
6
7
|
class << self; attr_accessor :list; end
|
7
8
|
self.list = []
|
8
|
-
# Boolean - does any of the categories
|
9
|
-
#
|
9
|
+
# Boolean - does any of the categories have
|
10
|
+
# a method that corresponds to sym?
|
10
11
|
def self.have_method?(sym); methods.include?(sym); end
|
12
|
+
# Cache the list of methods once it has been computed.
|
13
|
+
@@methods = []
|
11
14
|
# Provide a list of all methods implemented
|
12
15
|
# by all Treat categories.
|
13
|
-
@@methods = []
|
14
16
|
def self.methods
|
15
17
|
return @@methods unless @@methods.empty?
|
16
18
|
self.list.each do |ns|
|
data/lib/treat/category.rb
CHANGED
@@ -1,7 +1,11 @@
|
|
1
1
|
module Treat
|
2
|
-
#
|
2
|
+
# Clusters together groups of algorithms that
|
3
|
+
# perform similar functions.
|
3
4
|
module Category
|
5
|
+
# Require the Group class.
|
4
6
|
require 'treat/group'
|
7
|
+
# Add delegators to the Entities based on the
|
8
|
+
# configuration for a given category.
|
5
9
|
def self.extended(category)
|
6
10
|
Treat::Categories.list << category
|
7
11
|
category.module_eval do
|
@@ -14,9 +18,10 @@ module Treat
|
|
14
18
|
end
|
15
19
|
end
|
16
20
|
end
|
21
|
+
# Provides a list of groups within this category.
|
17
22
|
def groups; self.constants; end
|
18
23
|
# Provide a list of methods implemented in
|
19
|
-
# the groups contained within
|
24
|
+
# the groups contained within this category.
|
20
25
|
def methods
|
21
26
|
methods = []
|
22
27
|
groups.each do |group|
|
data/lib/treat/delegatable.rb
CHANGED
@@ -2,34 +2,8 @@ module Treat
|
|
2
2
|
# Makes a class delegatable, allowing calls on it to be forwarded
|
3
3
|
# to a delegate class performing the appropriate call.
|
4
4
|
module Delegatable
|
5
|
-
|
6
|
-
# Get the default delegate for that language
|
7
|
-
# inside the given group.
|
8
|
-
def get_language_delegate(language, group)
|
9
|
-
lang = Treat::Resources::Languages.describe(language)
|
10
|
-
lclass = cc(lang).intern
|
11
|
-
if Treat::Resources::Delegates.constants.include?(lclass)
|
12
|
-
cat = group.to_s.split('::')[-2].intern
|
13
|
-
lclass = Treat::Resources::Delegates.
|
14
|
-
const_get(lclass).const_get(cat)
|
15
|
-
g = ucc(cl(group)).intern
|
16
|
-
if !lclass[g] || !lclass[g][0]
|
17
|
-
d = ucc(cl(group))
|
18
|
-
d.gsub!('_', ' ')
|
19
|
-
d = d[0..-2] if d[-1] == 's'
|
20
|
-
d = 'delegator to find ' + d
|
21
|
-
raise Treat::Exception, "No #{d}" +
|
22
|
-
" is available for the #{lang} language."
|
23
|
-
end
|
24
|
-
return lclass[g][0]
|
25
|
-
else
|
26
|
-
raise Treat::Exception,
|
27
|
-
"Language '#{lang}' is not supported (yet)."
|
28
|
-
end
|
29
|
-
end
|
30
|
-
|
31
5
|
# Add decorator methods to entities.
|
32
|
-
def
|
6
|
+
def add_decorators(group, m)
|
33
7
|
decorators = group.methods -
|
34
8
|
Object.methods -
|
35
9
|
[:type, :type=, :targets, :targets=,
|
@@ -42,49 +16,83 @@ module Treat
|
|
42
16
|
end
|
43
17
|
end
|
44
18
|
end
|
45
|
-
|
46
|
-
# Raise an exception and suggest alternatives.
|
47
|
-
def delegate_not_found(klass, group)
|
48
|
-
"Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
|
49
|
-
did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
|
50
|
-
end
|
51
|
-
|
52
19
|
# Add delegator group to all entities of a class.
|
53
20
|
def add_delegators(group)
|
54
21
|
# Define each method in group.
|
55
22
|
self.class_eval do
|
56
23
|
m = group.method
|
57
|
-
|
24
|
+
add_decorators(group, m)
|
58
25
|
define_method(m) do |delegate=nil, options={}|
|
59
26
|
decorator = options.delete(:decorator)
|
60
27
|
puts self.id if !@features
|
61
28
|
if !@features[m].nil?
|
62
29
|
@features[m]
|
63
30
|
else
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
group.default
|
68
|
-
raise "No default delegate for #{group}." if delegate == :none
|
69
|
-
end
|
70
|
-
if not group.list.include?(delegate)
|
71
|
-
raise Treat::Exception,
|
72
|
-
self.class.delegate_not_found(delegate, group)
|
73
|
-
else
|
74
|
-
delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
|
75
|
-
result = accept(group, delegate_klass, m, options)
|
76
|
-
if decorator
|
77
|
-
result = group.send(decorator, self, result)
|
78
|
-
end
|
79
|
-
if group.type == :annotator
|
80
|
-
f = decorator.nil? ? m : decorator
|
81
|
-
@features[f] = result
|
82
|
-
end
|
83
|
-
result
|
84
|
-
end
|
31
|
+
self.class.call_delegator(
|
32
|
+
self, m, delegate, decorator,
|
33
|
+
group, options)
|
85
34
|
end
|
86
35
|
end
|
87
36
|
end
|
88
37
|
end
|
38
|
+
# Call a delegator.
|
39
|
+
def call_delegator(entity, m, delegate, decorator, group, options)
|
40
|
+
if delegate.nil?
|
41
|
+
delegate = get_missing_delegate(entity, group)
|
42
|
+
end
|
43
|
+
if not group.list.include?(delegate)
|
44
|
+
raise Treat::Exception, delegate_not_found(delegate, group)
|
45
|
+
else
|
46
|
+
delegate_klass = group.const_get(:"#{cc(delegate.to_s)}")
|
47
|
+
result = entity.accept(group, delegate_klass, m, options)
|
48
|
+
if decorator
|
49
|
+
result = group.send(decorator, self, result)
|
50
|
+
end
|
51
|
+
if group.type == :annotator
|
52
|
+
f = decorator.nil? ? m : decorator
|
53
|
+
entity.features[f] = result
|
54
|
+
end
|
55
|
+
result
|
56
|
+
end
|
57
|
+
end
|
58
|
+
# Get the default delegate for that language
|
59
|
+
# inside the given group.
|
60
|
+
def get_language_delegate(language, group)
|
61
|
+
lang = Treat::Languages.describe(language)
|
62
|
+
lclass = cc(lang).intern
|
63
|
+
if Treat::Languages.constants.include?(lclass)
|
64
|
+
cat = group.to_s.split('::')[-2].intern
|
65
|
+
lclass = Treat::Languages.const_get(lclass).const_get(cat)
|
66
|
+
g = ucc(cl(group)).intern
|
67
|
+
if !lclass[g] || !lclass[g][0]
|
68
|
+
d = ucc(cl(group))
|
69
|
+
d.gsub!('_', ' ')
|
70
|
+
d = d[0..-2] if d[-1] == 's'
|
71
|
+
d = 'delegator to find ' + d
|
72
|
+
raise Treat::Exception, "No #{d}" +
|
73
|
+
" is available for the #{lang} language."
|
74
|
+
end
|
75
|
+
return lclass[g][0]
|
76
|
+
else
|
77
|
+
raise Treat::Exception,
|
78
|
+
"Language '#{lang}' is not supported (yet)."
|
79
|
+
end
|
80
|
+
end
|
81
|
+
# Get which delegate to use if none has been supplied.
|
82
|
+
def get_missing_delegate(entity, group)
|
83
|
+
delegate = group.default.nil? ?
|
84
|
+
self.get_language_delegate(entity.language, group) :
|
85
|
+
group.default
|
86
|
+
if delegate == :none
|
87
|
+
raise NAT::Exception,
|
88
|
+
"There is intentionally no default delegate for #{group}."
|
89
|
+
end
|
90
|
+
delegate
|
91
|
+
end
|
92
|
+
# Return an error message and suggest possible typos.
|
93
|
+
def delegate_not_found(klass, group)
|
94
|
+
"Algorithm '#{ucc(klass)}' couldn't be found in group #{group}." +
|
95
|
+
did_you_mean?(group.list.map { |c| ucc(c) }, ucc(klass))
|
96
|
+
end
|
89
97
|
end
|
90
98
|
end
|
@@ -9,7 +9,8 @@ module Treat
|
|
9
9
|
dlvl = Treat.language_detection_level
|
10
10
|
if (Entities.rank(entity.type) < Entities.rank(dlvl)) &&
|
11
11
|
entity.has_parent?
|
12
|
-
|
12
|
+
anc = entity.ancestor_with_type(dlvl)
|
13
|
+
return anc.language if anc
|
13
14
|
end
|
14
15
|
end
|
15
16
|
end
|
@@ -2,7 +2,7 @@ module Treat
|
|
2
2
|
module Detectors
|
3
3
|
module Language
|
4
4
|
# Require the 'whatlanguage' gem.
|
5
|
-
|
5
|
+
silence_warnings { require 'whatlanguage' }
|
6
6
|
# Adaptor for the 'whatlanguage' gem, which
|
7
7
|
# performs probabilistic language detection.
|
8
8
|
class WhatLanguage < LanguageDetector
|
@@ -19,7 +19,7 @@ module Treat
|
|
19
19
|
all = @@wl.process_text(entity.to_s)
|
20
20
|
lang = {}
|
21
21
|
all.each do |k,v|
|
22
|
-
lang[Treat::
|
22
|
+
lang[Treat::Languages.find(k)] = v
|
23
23
|
end
|
24
24
|
Treat::Feature.new(lang).best
|
25
25
|
end
|
data/lib/treat/detectors.rb
CHANGED
@@ -1,6 +1,9 @@
|
|
1
1
|
module Treat
|
2
2
|
# Detectors detect a specific meta-information about
|
3
3
|
# an entity, such as encoding, format and language.
|
4
|
+
#
|
5
|
+
# Detectors are language-independent, and thus they
|
6
|
+
# are default algorithms specified for each of them.
|
4
7
|
module Detectors
|
5
8
|
# Group for algorithms that detect encoding.
|
6
9
|
module Encoding
|
@@ -93,7 +93,7 @@ module Treat
|
|
93
93
|
# dispatches done by Ruby to improve performance.
|
94
94
|
def parse_magic_method(sym, *args, &block)
|
95
95
|
@@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
|
96
|
-
@@cats_regexp ||= "(#{Treat::
|
96
|
+
@@cats_regexp ||= "(#{Treat::Languages::English::Categories.join('|')})"
|
97
97
|
method = sym.to_s =~ /entities/ ?
|
98
98
|
sym.to_s.gsub('entities', 'entitys'):
|
99
99
|
method = sym.to_s
|
data/lib/treat/entities.rb
CHANGED
@@ -4,15 +4,14 @@ module Treat
|
|
4
4
|
#
|
5
5
|
# - Collection
|
6
6
|
# - Document
|
7
|
-
# - Text
|
8
7
|
# - Zone (a Section, Title, Paragraph, or List)
|
9
8
|
# - Sentence
|
10
9
|
# - Constituent (a Phrase or Clause)
|
11
10
|
# - Token (a Word, Number, Punctuation, or Symbol).
|
12
11
|
module Entities
|
13
|
-
# Require Entity first
|
14
|
-
# extend this class.
|
12
|
+
# Require Entity first.
|
15
13
|
require 'treat/entities/entity'
|
14
|
+
# Then require all possible entities.
|
16
15
|
require 'treat/entities/collection'
|
17
16
|
require 'treat/entities/document'
|
18
17
|
require 'treat/entities/text'
|
@@ -40,13 +39,13 @@ module Treat
|
|
40
39
|
# comparison of entity types.
|
41
40
|
def self.rank(type)
|
42
41
|
klass = Entities.const_get(cc(type))
|
43
|
-
|
44
|
-
return
|
45
|
-
return
|
46
|
-
return
|
47
|
-
return
|
48
|
-
return
|
49
|
-
return
|
42
|
+
compare = lambda { |a,b| a == b || a < b }
|
43
|
+
return 0 if compare.call(klass, Token)
|
44
|
+
return 1 if compare.call(klass, Constituent)
|
45
|
+
return 2 if compare.call(klass, Sentence)
|
46
|
+
return 4 if compare.call(klass, Document)
|
47
|
+
return 3 if compare.call(klass, Section)
|
48
|
+
return 5 if compare.call(klass, Collection)
|
50
49
|
end
|
51
50
|
end
|
52
51
|
end
|
data/lib/treat/exception.rb
CHANGED
@@ -3,7 +3,7 @@ module Treat
|
|
3
3
|
module NamedEntity
|
4
4
|
class Stanford
|
5
5
|
# Require the Ruby-Java bridge.
|
6
|
-
|
6
|
+
silence_warnings do
|
7
7
|
require 'rjb'
|
8
8
|
Rjb::load(nil, ['-Xms256M', '-Xmx1024M'])
|
9
9
|
Rjb::add_jar('/ruby/treat/bin/treat/treat.jar')
|
@@ -23,7 +23,7 @@ module Treat
|
|
23
23
|
properties.set_property('ner.model.3class', '/ruby/treat/bin/stanford/classifiers/all.3class.distsim.crf.ser.gz')
|
24
24
|
properties.set_property('ner.model.7class', '/ruby/treat/bin/stanford/classifiers/muc.7class.distsim.crf.ser.gz')
|
25
25
|
properties.set_property('ner.model.MISCclass', '/ruby/treat/bin/stanford/classifiers/conll.4class.distsim.crf.ser.gz')
|
26
|
-
properties.set_property('parser.model', '/ruby/treat/bin/
|
26
|
+
properties.set_property('parser.model', '/ruby/treat/bin/stanford-parser/grammar/englishPCFG.ser.gz')
|
27
27
|
silence_stream(STDOUT) do
|
28
28
|
pipeline = StanfordCoreNLP.new(properties)
|
29
29
|
end
|
@@ -2,9 +2,9 @@ module Treat
|
|
2
2
|
module Extractors
|
3
3
|
module Time
|
4
4
|
class Chronic
|
5
|
-
|
5
|
+
silence_warnings { require 'chronic' }
|
6
6
|
def self.time(entity, options = {})
|
7
|
-
|
7
|
+
silence_warnings { ::Chronic.parse(entity.to_s, {:guess => true}) }
|
8
8
|
end
|
9
9
|
end
|
10
10
|
end
|
@@ -15,9 +15,9 @@ module Treat
|
|
15
15
|
=end
|
16
16
|
module Nickel
|
17
17
|
require 'date'
|
18
|
-
|
18
|
+
silence_warnings { require 'nickel' }
|
19
19
|
def self.time(entity, options = {})
|
20
|
-
n =
|
20
|
+
n = silence_warnings { ::Nickel.parse(entity.to_s) }
|
21
21
|
occ = n.occurrences[0]
|
22
22
|
# Find the words..
|
23
23
|
rec = occ.type.to_s.gsub('single', 'once').intern
|
@@ -11,12 +11,12 @@ module Treat
|
|
11
11
|
# Machine Learning Research. 3 (Mar. 2003), 993-1022.
|
12
12
|
class LDA
|
13
13
|
# Require the lda-ruby gem.
|
14
|
-
|
14
|
+
silence_warnings { require 'lda-ruby' }
|
15
15
|
# Monkey patch the TextCorpus class to call it without
|
16
16
|
# having to create any files.
|
17
17
|
Lda::TextCorpus.class_eval do
|
18
18
|
# Ruby, Y U NO SHUT UP!
|
19
|
-
|
19
|
+
silence_warnings { undef :initialize }
|
20
20
|
# Redefine initialize to take in an array of texts.
|
21
21
|
def initialize(texts)
|
22
22
|
super(nil)
|
data/lib/treat/extractors.rb
CHANGED
@@ -1,11 +1,11 @@
|
|
1
1
|
module Treat
|
2
2
|
# Extractors extract specific information out of texts.
|
3
3
|
module Extractors
|
4
|
-
# Extracts
|
5
|
-
#
|
4
|
+
# Extracts the time of an object and annotates it
|
5
|
+
# with specific information regarding time.
|
6
6
|
module Time
|
7
7
|
extend Group
|
8
|
-
self.type = :
|
8
|
+
self.type = :annotator
|
9
9
|
self.targets = [:word, :constituent, :symbol]
|
10
10
|
end
|
11
11
|
# Extract the topic from a text.
|
@@ -20,22 +20,25 @@ module Treat
|
|
20
20
|
self.type = :annotator
|
21
21
|
self.targets = [:collection, :document, :text, :zone, :sentence]
|
22
22
|
end
|
23
|
-
|
24
|
-
extend Group
|
25
|
-
self.type = :computer
|
26
|
-
self.targets = [:entity]
|
27
|
-
self.default = :none
|
28
|
-
end
|
23
|
+
# Extract named entities from texts.
|
29
24
|
module NamedEntity
|
30
25
|
extend Group
|
31
26
|
self.type = :computer
|
32
27
|
self.targets = [:entity]
|
33
28
|
end
|
29
|
+
# Extract the key sentences from a text.
|
34
30
|
module KeySentences
|
35
31
|
extend Group
|
36
32
|
self.type = :computer
|
37
33
|
self.targets = [:collection, :document, :text, :zone, :sentence]
|
38
34
|
end
|
35
|
+
# This module should be moved out of here ASAP.
|
36
|
+
module Statistics
|
37
|
+
extend Group
|
38
|
+
self.type = :computer
|
39
|
+
self.targets = [:entity]
|
40
|
+
self.default = :none
|
41
|
+
end
|
39
42
|
extend Treat::Category
|
40
43
|
end
|
41
44
|
end
|
data/lib/treat/feature.rb
CHANGED
@@ -1,4 +1,9 @@
|
|
1
1
|
module Treat
|
2
|
+
# This class represents a probabilistic feature;
|
3
|
+
# it is currently not used, because its
|
4
|
+
# behaviour is non-deterministic. Perhaps at
|
5
|
+
# some point this will be of value for specific
|
6
|
+
# algorithms and so I'm keeping it here.
|
2
7
|
class Feature
|
3
8
|
# Undefine all methods, except those that
|
4
9
|
# create any problems (e.g. with serializing).
|
@@ -26,7 +31,7 @@ module Treat
|
|
26
31
|
end
|
27
32
|
end
|
28
33
|
# Normalize the probabilities, so that
|
29
|
-
# the sum of all probabilities is
|
34
|
+
# the sum of all probabilities is 1,
|
30
35
|
# except if the sum of all probabilities
|
31
36
|
# is already below one (in which case we
|
32
37
|
# assume that the feature is intentionally
|
data/lib/treat/formatters.rb
CHANGED
@@ -10,6 +10,13 @@ module Treat
|
|
10
10
|
self.targets = [:collection, :document]
|
11
11
|
self.default = :autoselect
|
12
12
|
end
|
13
|
+
# Serializers transform entities into a storable format.
|
14
|
+
module Serializers
|
15
|
+
extend Group
|
16
|
+
self.type = :computer
|
17
|
+
self.targets = [:entity]
|
18
|
+
self.default = :yaml
|
19
|
+
end
|
13
20
|
# Unserializers recreate entities from a serialized format.
|
14
21
|
module Unserializers
|
15
22
|
extend Group
|
@@ -24,14 +31,7 @@ module Treat
|
|
24
31
|
self.targets = [:entity]
|
25
32
|
self.default = :tree
|
26
33
|
end
|
27
|
-
#
|
28
|
-
module Serializers
|
29
|
-
extend Group
|
30
|
-
self.type = :computer
|
31
|
-
self.targets = [:entity]
|
32
|
-
self.default = :yaml
|
33
|
-
end
|
34
|
-
# Serializers transform entities into a storable format.
|
34
|
+
# Cleaners strip a text from its mark up.
|
35
35
|
module Cleaners
|
36
36
|
extend Group
|
37
37
|
self.type = :annotator
|