treat 2.0.3 → 2.0.4
- data/lib/treat/config/data/languages/agnostic.rb +6 -3
- data/lib/treat/config/data/languages/english.rb +1 -1
- data/lib/treat/config/data/workers/extractors.rb +8 -0
- data/lib/treat/loaders/stanford.rb +2 -0
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/extractors/distance/levenshtein.rb +35 -0
- data/lib/treat/workers/extractors/name_tag/stanford.rb +4 -1
- data/lib/treat/workers/extractors/similarity/jaro_winkler.rb +38 -0
- data/lib/treat/workers/extractors/similarity/tf_idf.rb +19 -3
- data/lib/treat/workers/extractors/time/chronic.rb +6 -41
- data/lib/treat/workers/extractors/time/kronic.rb +20 -0
- data/lib/treat/workers/extractors/time/nickel.rb +0 -15
- data/lib/treat/workers/extractors/time/ruby.rb +2 -33
- data/lib/treat/workers/lexicalizers/taggers/stanford.rb +11 -10
- data/lib/treat/workers/processors/parsers/stanford.rb +60 -112
- data/spec/entities/collection.rb +29 -25
- data/spec/entities/document.rb +45 -44
- data/spec/entities/entity.rb +295 -294
- data/spec/entities/phrase.rb +21 -17
- data/spec/entities/token.rb +43 -40
- data/spec/entities/word.rb +5 -1
- data/spec/entities/zone.rb +26 -22
- data/spec/helper.rb +7 -2
- data/spec/learning/data_set.rb +145 -141
- data/spec/learning/export.rb +46 -42
- data/spec/learning/problem.rb +114 -110
- data/spec/learning/question.rb +46 -42
- data/spec/treat.rb +41 -37
- data/spec/workers/agnostic.rb +2 -2
- data/spec/workers/english.rb +12 -12
- metadata +7 -8
- data/files/21552208.html +0 -786
- data/files/nethttp-cheat-sheet-2940.html +0 -393
- data/lib/treat/workers/extractors/similarity/levenshtein.rb +0 -36
- data/spec/sandbox.rb +0 -294
- data/spec/workers/examples/english/mathematicians/euler.html +0 -21
@@ -4,9 +4,10 @@
     'bson_ext', 'mongo', 'lda-ruby',
     'stanford-core-nlp', 'linguistics',
     'ruby-readability', 'whatlanguage',
-    'chronic', 'nickel', 'decisiontree',
+    'chronic', 'kronic', 'nickel', 'decisiontree',
     'rb-libsvm', 'ruby-fann', 'zip', 'loggability',
-    'tf-idf-similarity', 'narray', 'fastimage'
+    'tf-idf-similarity', 'narray', 'fastimage',
+    'fuzzy-string-match', 'levenshtein-ffi'
   ],
   workers: {
     learners: {
@@ -16,7 +17,9 @@
       keywords: [:tf_idf],
       language: [:what_language],
       topic_words: [:lda],
-      tf_idf: [:native]
+      tf_idf: [:native],
+      distance: [:levenshtein],
+      similarity: [:jaro_winkler, :tf_idf]
     },
     formatters: {
       serializers: [:xml, :yaml, :mongo],
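With these defaults in place, the new distance and similarity extractor groups can be used without naming a worker. A hypothetical usage sketch (not part of this changeset), assuming the treat gem plus its new levenshtein-ffi and fuzzy-string-match dependencies are installed, and assuming Treat's usual dynamic dispatch (group name as an entity method, optional worker symbol as first argument):

    require 'treat'
    include Treat::Core::DSL

    w = word('kitten')
    # Falls back to the defaults configured above.
    w.distance(to: 'sitting')                   # :levenshtein by default; edit distance (3 here)
    w.similarity(:jaro_winkler, to: 'sitting')  # worker named explicitly; score in 0..1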
data/lib/treat/workers/extractors/distance/levenshtein.rb
ADDED
@@ -0,0 +1,35 @@
+# The C extension uses char* strings, and so Unicode strings
+# will give incorrect distances. Need to provide a pure
+# implementation if that's the case (FIX).
+class Treat::Workers::Extractors::Distance::Levenshtein
+
+  require 'levenshtein'
+
+  DefaultOptions = {
+    ins_cost: 1,
+    del_cost: 1,
+    sub_cost: 1
+  }
+
+  @@matcher = nil
+
+  # Return the levensthein distance between
+  # two strings taking into account the costs
+  # of insertion, deletion, and substitution.
+  def self.distance(entity, options)
+
+    options = DefaultOptions.merge(options)
+
+    unless options[:to]
+      raise Treat::Exception, "Must supply " +
+        "a string/entity to compare to using " +
+        "the option :to for this worker."
+    end
+
+    a, b = entity.to_s, options[:to].to_s
+
+    Levenshtein.distance(a, b)
+
+  end
+
+end
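The worker above only needs something that responds to #to_s, so it can be exercised directly. A small sketch with made-up inputs, assuming treat and levenshtein-ffi are installed:

    require 'treat'

    lev = Treat::Workers::Extractors::Distance::Levenshtein
    lev.distance('kitten', to: 'sitting')  # => 3
    lev.distance('kitten', {})             # raises Treat::Exception: no :to supplied

Note that, as written, the ins_cost/del_cost/sub_cost options are merged into the defaults but not forwarded to Levenshtein.distance.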
@@ -1,7 +1,7 @@
 # Named entity tag extraction using the Stanford NLP
 # Deterministic Coreference Resolver, which implements a
 # multi-pass sieve coreference resolution (or anaphora
-# resolution) system.
+# resolution) system based on conditional random fields.
 #
 # Original paper: Heeyoung Lee, Yves Peirsman, Angel
 # Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky.
@@ -24,6 +24,9 @@ class Treat::Workers::Extractors::NameTag::Stanford
 
     unless classifier = @@classifiers[language]
       model = Treat::Loaders::Stanford.find_model(:ner, language)
+      unless StanfordCoreNLP.const_defined?('CRFClassifier')
+        StanfordCoreNLP.load_class('CRFClassifier', 'edu.stanford.nlp.ie.crf')
+      end
       classifier = StanfordCoreNLP::CRFClassifier.getClassifier(model)
       @@classifiers[language] = classifier
     end
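The guard added here (and the matching one in the tagger further down) imports the Java class lazily, so it is loaded once per process and only when NER is actually used. A standalone sketch of the same initialization, assuming the stanford-core-nlp gem and its models are installed, and assuming :english is a valid language argument:

    require 'treat'

    # Load the Stanford gem/JARs through Treat, then import the class once.
    Treat::Loaders::Stanford.load(:english)
    unless StanfordCoreNLP.const_defined?('CRFClassifier')
      StanfordCoreNLP.load_class('CRFClassifier', 'edu.stanford.nlp.ie.crf')
    end
    model      = Treat::Loaders::Stanford.find_model(:ner, :english)
    classifier = StanfordCoreNLP::CRFClassifier.getClassifier(model)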
@@ -0,0 +1,38 @@
+# Similarity measure for short strings such as person names.
+# C extension won't work for Unicode strings; need to set
+# extension to "pure" in that case (FIX).
+class Treat::Workers::Extractors::Similarity::JaroWinkler
+
+  require 'fuzzystringmatch'
+
+  DefaultOptions = {
+    threshold: 0.7,
+    implementation: nil
+  }
+
+  @@matcher = nil
+
+  def self.similarity(entity, options={})
+
+    options = DefaultOptions.merge(options)
+
+    unless options[:to]
+      raise Treat::Exception, "Must supply " +
+        "a string/entity to compare to using " +
+        "the option :to for this worker."
+    end
+
+    unless @@matcher
+      impl = options[:implementation]
+      impl ||= defined?(JRUBY_VERSION) ? :pure : :native
+      klass = FuzzyStringMatch::JaroWinkler
+      @@matcher = klass.create(impl)
+    end
+
+    a, b = entity.to_s, options[:to].to_s
+
+    @@matcher.getDistance(a, b)
+
+  end
+
+end
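A small usage sketch for the new Jaro-Winkler worker (made-up inputs; assumes treat and fuzzy-string-match are installed). The matcher is memoized in @@matcher, so the :implementation option only takes effect on the first call:

    require 'treat'

    jw = Treat::Workers::Extractors::Similarity::JaroWinkler
    jw.similarity('martha', to: 'marhta')  # => roughly 0.96 (score in 0..1)
    jw.similarity('martha', to: 'zzzzzz')  # low score for dissimilar strings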
@@ -2,12 +2,28 @@
 class Treat::Workers::Extractors::Similarity::TfIdf
 
   require 'tf-idf-similarity'
+
+  def self.similarity(entity, options={})
 
-
+    raise 'Not currently implemented.'
+
+    unless options[:to] &&
+      options[:to].type == :document
+      raise Treat::Exception, 'Must supply ' +
+        'a document to compare to using ' +
+        'the option :to for this worker.'
+    end
 
-
+    unless options[:to].parent_collection &&
+      entity.parent_collection
+      raise Treat::Exception, 'The TF*IDF ' +
+        'similarity algorithm can only be applied ' +
+        'to documents that are inside collections.'
+    end
+
     coll = TfIdfSimilarity::Collection.new
-
+
+    entity.each_document do |doc|
       tdoc = TfIdfSimilarity::Document.new(doc.to_s)
       term_counts = Hash.new(0)
       doc.each_word do |word|
@@ -2,52 +2,17 @@
 # Ruby natural language date parser.
 class Treat::Workers::Extractors::Time::Chronic
 
-
-  silence_warnings { require 'chronic' }
-
-  # Require the Ruby DateTime module
+  require 'chronic'
   require 'date'
+
+  DefaultOptions = {guess: true}
 
   # Return the date information contained within
   # the entity by parsing it with the 'chronic' gem.
-  #
-  # Options: none.
   def self.time(entity, options = {})
-
-
-
-    time = nil
-
-    silence_warnings do
-      time = ::Chronic.parse(s, {:guess => true})
-    end
-
-    if entity.has_parent? && remove_time_from_ancestors(entity, time)
-      nil
-    else
-      time
-    end
-
-  end
-
-  # Keeps the lowest-level time annotations that do
-  # not conflict with a higher time annotation.
-  # Returns true if the entity conflicts with a
-  # higher-level time annotation.
-  def self.remove_time_from_ancestors(entity, time)
-
-    entity.ancestors_with_type(:phrase).each do |a|
-
-      next if !a.has?(:time)
-      unless a.get(:time) == time
-        return true
-      end
-      a.unset(:time)
-
-    end
-
-    false
-
+    options = DefaultOptions.merge(options)
+    time = ::Chronic.parse(entity.to_s, options)
+    time ? DateTime.parse(time.to_s) : nil
   end
 
 end
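A usage sketch for the simplified Chronic worker (hypothetical inputs; assumes treat and chronic are installed). Anything beyond the default options is passed straight through to Chronic.parse:

    require 'treat'

    chronic = Treat::Workers::Extractors::Time::Chronic
    chronic.time('next friday at 3pm')  # => a DateTime (Chronic's result re-parsed via DateTime.parse)
    chronic.time('no date here')        # => nil (Chronic.parse returned nil)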
@@ -0,0 +1,20 @@
+# Time/date extraction using a simple rule-based library.
+#
+# Supported formats: Today, yesterday, tomorrow,
+# last thursday, this thursday, 14 Sep, 14 June 2010.
+# Any dates without a year are assumed to be in the past.
+class Treat::Workers::Extractors::Time::Kronic
+
+  require 'kronic'
+  require 'date'
+
+  # Return the date information contained within
+  # the entity by parsing it with the 'chronic' gem.
+  #
+  # Options: none.
+  def self.time(entity, options = {})
+    time = Kronic.parse(entity.to_s)
+    time.is_a?(DateTime) ? time : nil
+  end
+
+end
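A sketch of the new Kronic worker (hypothetical inputs; assumes treat and kronic are installed). Per the code above, a result is only returned when Kronic yields a DateTime; anything else, including unparseable strings, comes back as nil:

    require 'treat'

    kronic = Treat::Workers::Extractors::Time::Kronic
    kronic.time('14 June 2010')  # parsed by Kronic; returned only if it is a DateTime
    kronic.time('not a date')    # => nil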
@@ -47,7 +47,6 @@ class Treat::Workers::Extractors::Time::Nickel
       occ.interval : :none
     time_recurrence_interval = interval
 
-
     s = [occ.start_date, occ.start_time]
     ds = [s[0].year, s[0].month, s[0].day] if s[0]
     ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
@@ -77,18 +76,4 @@ class Treat::Workers::Extractors::Time::Nickel
 
   end
 
-  # Keeps the lowest-level time annotations that do
-  # not conflict with a higher time annotation.
-  # Returns true if the entity conflicts with a
-  # higher-level time annotation.
-  def self.remove_time_from_ancestors(entity, time)
-    entity.ancestors_with_type(:phrase).each do |a|
-      next if !a.has?(:time)
-      return false unless a.get(:time).to_s == time.to_s
-      a.unset(:time, :time_recurrence,
-        :time_recurrence_interval, :end_time)
-    end
-    true
-  end
-
 end
@@ -2,7 +2,7 @@
 # DateTime.parse() method.
 class Treat::Workers::Extractors::Time::Ruby
 
-
+
   require 'date'
 
   # Return a DateTime object representing the date/time
@@ -13,42 +13,11 @@ class Treat::Workers::Extractors::Time::Ruby
   #
   # Options: none.
   def self.time(entity, options = {})
-    s = entity.to_s
-    return if s =~ /^[0-9]+$/
     begin
-
-      if entity.has_parent? &&
-        remove_time_from_ancestors(entity, time)
-        nil
-      else
-        time
-      end
+      DateTime.parse(entity.to_s)
     rescue
       nil
     end
   end
 
-
-  # Keeps the lowest-level time annotations that do
-  # not conflict with a higher time annotation.
-  # Returns true if the entity conflicts with a
-  # higher-level time annotation.
-  def self.remove_time_from_ancestors(entity, time)
-
-    entity.ancestors_with_type(:phrase).each do |a|
-
-      next if !a.has?(:time)
-
-      unless a.get(:time) == time
-        return true
-      end
-
-      a.unset(:time)
-
-    end
-
-    false
-
-  end
-
 end
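The slimmed-down Ruby extractor is now a thin wrapper around DateTime.parse with a rescue. A quick sketch with made-up inputs:

    require 'treat'

    ruby_time = Treat::Workers::Extractors::Time::Ruby
    ruby_time.time('2013-02-03 10:00')  # => DateTime via DateTime.parse
    ruby_time.time('not a date')        # => nil (the parse error is rescued)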
@@ -1,10 +1,10 @@
-# POS tagging using
-#
-#
-#
-# words, (iii)
-#
-# unknown word features.
+# POS tagging using a maximum entropy model, with (i)
+# explicit use of both preceding and following tag
+# contexts via a dependency network representation,
+# (ii) broad use of lexical features, including jointly
+# conditioning on multiple consecutive words, (iii)
+# effective use of priors in conditional loglinear models,
+# and (iv) fine-grained modeling of unknown word features.
 #
 # Original paper: Toutanova, Manning, Klein and Singer.
 # 2003. Feature-Rich Part-of-Speech Tagging with a
@@ -21,9 +21,6 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
     :tagger_model => nil
   }
 
-  # Shortcut for gem config.
-  Config = StanfordCoreNLP::Config
-
   # Tag the word using one of the Stanford taggers.
   def self.tag(entity, options = {})
 
@@ -64,6 +61,10 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
   def self.init_tagger(language)
     unless @@taggers[language]
       Treat::Loaders::Stanford.load(language)
+      unless StanfordCoreNLP.const_defined?('MaxentTagger')
+        StanfordCoreNLP.load_class('MaxentTagger',
+          'edu.stanford.nlp.tagger.maxent')
+      end
       model = Treat::Loaders::Stanford.find_model(:pos,language)
       tagger = StanfordCoreNLP::MaxentTagger.new(model)
       @@taggers[language] = tagger
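The same lazy-import idea as in the NER worker, applied to the tagger. A sketch mirroring init_tagger above (assumes stanford-core-nlp plus its POS models are installed, and that :english is a valid language key):

    require 'treat'

    Treat::Loaders::Stanford.load(:english)
    unless StanfordCoreNLP.const_defined?('MaxentTagger')
      StanfordCoreNLP.load_class('MaxentTagger', 'edu.stanford.nlp.tagger.maxent')
    end
    model  = Treat::Loaders::Stanford.find_model(:pos, :english)
    tagger = StanfordCoreNLP::MaxentTagger.new(model)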
@@ -1,140 +1,88 @@
-# Parsing using an interface to a Java implementation
-# of probabilistic natural language parsers, both
-# optimized PCFG and lexicalized dependency parsers,
-# and a lexicalized PCFG parser.
-#
-# Original paper: Dan Klein and Christopher D.
-# Manning. 2003. Accurate Unlexicalized Parsing.
-# Proceedings of the 41st Meeting of the Association
+# Parsing using an interface to a Java implementation
+# of probabilistic natural language parsers, both
+# optimized PCFG and lexicalized dependency parsers,
+# and a lexicalized PCFG parser.
+#
+# Original paper: Dan Klein and Christopher D.
+# Manning. 2003. Accurate Unlexicalized Parsing.
+# Proceedings of the 41st Meeting of the Association
 # for Computational Linguistics, pp. 423-430.
 class Treat::Workers::Processors::Parsers::Stanford
-
+
   Pttc = Treat.tags.aligned.phrase_tags_to_category
-
+
   # Hold one instance of the pipeline per language.
   @@parsers = {}
 
-  DefaultOptions = {
-    :parser_model => nil,
-    :tagger_model => nil
-  }
+  DefaultOptions = { model: nil }
 
   # Parse the entity using the Stanford parser.
-  #
-  # Options:
-  #
-  # - (Boolean) :silent => whether to silence the output
-  # of the JVM.
-  # - (String) :log_file => a filename to log output to
-  # instead of displaying it.
   def self.parse(entity, options = {})
 
-    val, lang = entity.to_s, entity.language
-
-
-    entity.check_hasnt_children
+    val, lang = entity.to_s, entity.language.intern
+
+    Treat::Loaders::Stanford.load(lang)
 
     tag_set = StanfordCoreNLP::Config::TagSets[lang]
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    end
-
+    list = get_token_list(entity)
+    entity.remove_all!
+
+    model_file = options[:model] ||
+      StanfordCoreNLP::Config::Models[:parse][lang]
+
+    unless @@parsers[lang] && @@parsers[lang][model_file]
+      model_path = Treat.libraries.stanford.model_path ||
+        StanfordCoreNLP.model_path
+      model_folder = StanfordCoreNLP::Config::ModelFolders[:parse]
+      model = File.join(model_path, model_folder, model_file)
+      @@parsers[lang] ||= {}
+      options = StanfordCoreNLP::Options.new
+      parser = StanfordCoreNLP::LexicalizedParser
+        .getParserFromFile(model, options)
+      @@parsers[lang][model_file] = parser
     end
-
-    entity.set :tag_set, tag_set
 
-
+    parser = @@parsers[lang][model_file]
+
+    text = parser.apply(list)
+
+    recurse(text.children[0], entity, tag_set)
+    entity.set :tag_set, tag_set
 
-  def self.init(lang, options)
-    Treat::Loaders::Stanford.load(lang)
-    options = DefaultOptions.merge(options)
-    StanfordCoreNLP.use(lang.intern)
-    if options[:tagger_model]
-      StanfordCoreNLP.set_model('pos.model', options[:tagger_model])
-    end
-    if options[:parser_model]
-      StanfordCoreNLP.set_model('parser.model', options[:parser_model])
-    end
-    annotators = [:tokenize, :ssplit, :pos, :lemma, :parse]
-    @@parsers[lang] = StanfordCoreNLP.load(*annotators)
   end
 
-
-
-
-
-    if java_node.num_children == 0
-
-      label = java_node.label
-      tag = label.get(:part_of_speech).to_s
-      tag_s, tag_opt = *tag.split('-')
-      tag_s ||= ''
-      ruby_node.value = java_node.value.to_s.strip
-      ruby_node.set :tag, tag_s
-      ruby_node.set :tag_opt, tag_opt if tag_opt
-      ruby_node.set :lemma, label.get(:lemma).to_s
-
-      additional_tags.each do |t|
-        lt = label.get(t)
-        ruby_node.set t, lt.to_s if lt
-      end
-
-      ruby_node
-
-    else
-
-      if java_node.num_children == 1 &&
-        java_node.children[0].num_children == 0
-        recurse(java_node.children[0],
-          ruby_node, tag_set, additional_tags)
-        return
-      end
+  def self.recurse(java_node, ruby_node, tag_set)
+
+    java_node.children.each do |java_child|
 
-
-
-      label = java_child.label
-      tag = label.get(:category).to_s
-      tag_s, tag_opt = *tag.split('-')
-      tag_s ||= ''
-
-      if Pttc[tag_s] && Pttc[tag_s][tag_set]
-        ruby_child = Treat::Entities::Phrase.new
-      else
-        l = java_child.children[0].to_s
-        v = java_child.children[0].value.to_s.strip
-
-        # Mhmhmhmhmhm FIX!
-        val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
-        ruby_child = Treat::Entities::Token.from_string(val)
-      end
+      label = java_child.label
+      tag = label.get(:category).to_s
 
-
-      ruby_child
+      if Pttc[tag] && Pttc[tag][tag_set]
+        ruby_child = Treat::Entities::Phrase.new
+        ruby_child.set :tag, tag
         ruby_node << ruby_child
-
         unless java_child.children.empty?
-          recurse(java_child, ruby_child, tag_set
+          recurse(java_child, ruby_child, tag_set)
         end
-
+      else
+        val = java_child.children[0].to_s
+        ruby_child = Treat::Entities::Token.from_string(val)
+        ruby_child.set :tag, tag
+        ruby_node << ruby_child
       end
-
+
     end
 
   end
-
+
+  def self.get_token_list(entity)
+    list = StanfordCoreNLP::ArrayList.new
+    entity.tokens.each do |token|
+      list.add(StanfordCoreNLP::Word.new(token.to_s))
+    end
+    list
+  end
+
 end
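The rewritten parser no longer builds a full CoreNLP pipeline; it feeds the entity's existing tokens to a LexicalizedParser cached per language and model file, then rebuilds the entity's children from the resulting tree. A hypothetical end-to-end sketch, assuming Treat's DSL, that the entity is tokenized first (get_token_list reads entity.tokens), and that the Stanford JARs and parser models are installed:

    require 'treat'
    include Treat::Core::DSL

    p = phrase('A rare bird is the equal of a rare word')
    p.tokenize   # the parser builds its input list from entity.tokens
    Treat::Workers::Processors::Parsers::Stanford.parse(p)
    p.has?(:tag_set)   # => true; the tag set used is recorded on the entity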