treat 2.0.3 → 2.0.4
This diff shows the changes between publicly released versions of the package as they appear in its public registry. It is provided for informational purposes only.
- data/lib/treat/config/data/languages/agnostic.rb +6 -3
- data/lib/treat/config/data/languages/english.rb +1 -1
- data/lib/treat/config/data/workers/extractors.rb +8 -0
- data/lib/treat/loaders/stanford.rb +2 -0
- data/lib/treat/version.rb +1 -1
- data/lib/treat/workers/extractors/distance/levenshtein.rb +35 -0
- data/lib/treat/workers/extractors/name_tag/stanford.rb +4 -1
- data/lib/treat/workers/extractors/similarity/jaro_winkler.rb +38 -0
- data/lib/treat/workers/extractors/similarity/tf_idf.rb +19 -3
- data/lib/treat/workers/extractors/time/chronic.rb +6 -41
- data/lib/treat/workers/extractors/time/kronic.rb +20 -0
- data/lib/treat/workers/extractors/time/nickel.rb +0 -15
- data/lib/treat/workers/extractors/time/ruby.rb +2 -33
- data/lib/treat/workers/lexicalizers/taggers/stanford.rb +11 -10
- data/lib/treat/workers/processors/parsers/stanford.rb +60 -112
- data/spec/entities/collection.rb +29 -25
- data/spec/entities/document.rb +45 -44
- data/spec/entities/entity.rb +295 -294
- data/spec/entities/phrase.rb +21 -17
- data/spec/entities/token.rb +43 -40
- data/spec/entities/word.rb +5 -1
- data/spec/entities/zone.rb +26 -22
- data/spec/helper.rb +7 -2
- data/spec/learning/data_set.rb +145 -141
- data/spec/learning/export.rb +46 -42
- data/spec/learning/problem.rb +114 -110
- data/spec/learning/question.rb +46 -42
- data/spec/treat.rb +41 -37
- data/spec/workers/agnostic.rb +2 -2
- data/spec/workers/english.rb +12 -12
- metadata +7 -8
- data/files/21552208.html +0 -786
- data/files/nethttp-cheat-sheet-2940.html +0 -393
- data/lib/treat/workers/extractors/similarity/levenshtein.rb +0 -36
- data/spec/sandbox.rb +0 -294
- data/spec/workers/examples/english/mathematicians/euler.html +0 -21
data/lib/treat/config/data/languages/agnostic.rb CHANGED

@@ -4,9 +4,10 @@
       'bson_ext', 'mongo', 'lda-ruby',
       'stanford-core-nlp', 'linguistics',
       'ruby-readability', 'whatlanguage',
-      'chronic', 'nickel', 'decisiontree',
+      'chronic', 'kronic', 'nickel', 'decisiontree',
       'rb-libsvm', 'ruby-fann', 'zip', 'loggability',
-      'tf-idf-similarity', 'narray', 'fastimage'
+      'tf-idf-similarity', 'narray', 'fastimage',
+      'fuzzy-string-match', 'levenshtein-ffi'
     ],
     workers: {
       learners: {
@@ -16,7 +17,9 @@
         keywords: [:tf_idf],
         language: [:what_language],
         topic_words: [:lda],
-        tf_idf: [:native]
+        tf_idf: [:native],
+        distance: [:levenshtein],
+        similarity: [:jaro_winkler, :tf_idf]
       },
       formatters: {
         serializers: [:xml, :yaml, :mongo],
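The configuration change registers two new extractor groups, distance and similarity, alongside the new 'kronic', 'fuzzy-string-match', and 'levenshtein-ffi' dependencies. A minimal usage sketch follows, assuming Treat's usual worker-dispatch convention in which the group name becomes an entity method and the worker name is passed as the first argument; the exact call forms are an assumption, not shown in this diff:

  require 'treat'
  include Treat::Core::DSL

  w = word('kitten')
  # Assumed dispatch forms for the newly registered groups:
  w.distance(:levenshtein, to: 'sitting')     # Levenshtein edit distance (3 here)
  w.similarity(:jaro_winkler, to: 'sitting')  # Jaro-Winkler score in [0, 1]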
data/lib/treat/workers/extractors/distance/levenshtein.rb ADDED
@@ -0,0 +1,35 @@
+# The C extension uses char* strings, and so Unicode strings
+# will give incorrect distances. Need to provide a pure
+# implementation if that's the case (FIX).
+class Treat::Workers::Extractors::Distance::Levenshtein
+
+  require 'levenshtein'
+
+  DefaultOptions = {
+    ins_cost: 1,
+    del_cost: 1,
+    sub_cost: 1
+  }
+
+  @@matcher = nil
+
+  # Return the levensthein distance between
+  # two strings taking into account the costs
+  # of insertion, deletion, and substitution.
+  def self.distance(entity, options)
+
+    options = DefaultOptions.merge(options)
+
+    unless options[:to]
+      raise Treat::Exception, "Must supply " +
+        "a string/entity to compare to using " +
+        "the option :to for this worker."
+    end
+
+    a, b = entity.to_s, options[:to].to_s
+
+    Levenshtein.distance(a, b)
+
+  end
+
+end
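A direct-call sketch of the new worker, based on the signature shown above: the first argument only needs to respond to #to_s and the :to option is required. Note that the cost options are merged into the defaults but, as written, are not forwarded to Levenshtein.distance.

  # Assumes the treat gem and this worker file are already loaded.
  Treat::Workers::Extractors::Distance::Levenshtein.distance(
    'kitten', to: 'sitting'
  ) # => 3

  # Omitting :to raises Treat::Exception, per the guard clause above.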
data/lib/treat/workers/extractors/name_tag/stanford.rb CHANGED

@@ -1,7 +1,7 @@
 # Named entity tag extraction using the Stanford NLP
 # Deterministic Coreference Resolver, which implements a
 # multi-pass sieve coreference resolution (or anaphora
-# resolution) system.
+# resolution) system based on conditional random fields.
 #
 # Original paper: Heeyoung Lee, Yves Peirsman, Angel
 # Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky.
@@ -24,6 +24,9 @@ class Treat::Workers::Extractors::NameTag::Stanford
 
     unless classifier = @@classifiers[language]
       model = Treat::Loaders::Stanford.find_model(:ner, language)
+      unless StanfordCoreNLP.const_defined?('CRFClassifier')
+        StanfordCoreNLP.load_class('CRFClassifier', 'edu.stanford.nlp.ie.crf')
+      end
       classifier = StanfordCoreNLP::CRFClassifier.getClassifier(model)
       @@classifiers[language] = classifier
     end
data/lib/treat/workers/extractors/similarity/jaro_winkler.rb ADDED

@@ -0,0 +1,38 @@
+# Similarity measure for short strings such as person names.
+# C extension won't work for Unicode strings; need to set
+# extension to "pure" in that case (FIX).
+class Treat::Workers::Extractors::Similarity::JaroWinkler
+
+  require 'fuzzystringmatch'
+
+  DefaultOptions = {
+    threshold: 0.7,
+    implementation: nil
+  }
+
+  @@matcher = nil
+
+  def self.similarity(entity, options={})
+
+    options = DefaultOptions.merge(options)
+
+    unless options[:to]
+      raise Treat::Exception, "Must supply " +
+        "a string/entity to compare to using " +
+        "the option :to for this worker."
+    end
+
+    unless @@matcher
+      impl = options[:implementation]
+      impl ||= defined?(JRUBY_VERSION) ? :pure : :native
+      klass = FuzzyStringMatch::JaroWinkler
+      @@matcher = klass.create(impl)
+    end
+
+    a, b = entity.to_s, options[:to].to_s
+
+    @@matcher.getDistance(a, b)
+
+  end
+
+end
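A direct-call sketch based on the signature shown above. The :implementation option chooses between the fuzzy-string-match gem's native C matcher and its pure-Ruby one; because the matcher is memoized in @@matcher, the option only takes effect on the first call in a process.

  Treat::Workers::Extractors::Similarity::JaroWinkler.similarity(
    'martha', to: 'marhta', implementation: :pure
  ) # => roughly 0.96 (1.0 means identical strings)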
data/lib/treat/workers/extractors/similarity/tf_idf.rb CHANGED

@@ -2,12 +2,28 @@
 class Treat::Workers::Extractors::Similarity::TfIdf
 
   require 'tf-idf-similarity'
+
+  def self.similarity(entity, options={})
 
-
+    raise 'Not currently implemented.'
+
+    unless options[:to] &&
+      options[:to].type == :document
+      raise Treat::Exception, 'Must supply ' +
+      'a document to compare to using ' +
+      'the option :to for this worker.'
+    end
 
-
+    unless options[:to].parent_collection &&
+      entity.parent_collection
+      raise Treat::Exception, 'The TF*IDF ' +
+      'similarity algorithm can only be applied ' +
+      'to documents that are inside collections.'
+    end
+
     coll = TfIdfSimilarity::Collection.new
-
+
+    entity.each_document do |doc|
       tdoc = TfIdfSimilarity::Document.new(doc.to_s)
       term_counts = Hash.new(0)
       doc.each_word do |word|
data/lib/treat/workers/extractors/time/chronic.rb CHANGED

@@ -2,52 +2,17 @@
 # Ruby natural language date parser.
 class Treat::Workers::Extractors::Time::Chronic
 
-
-  silence_warnings { require 'chronic' }
-
-  # Require the Ruby DateTime module
+  require 'chronic'
   require 'date'
+
+  DefaultOptions = {guess: true}
 
   # Return the date information contained within
   # the entity by parsing it with the 'chronic' gem.
-  #
-  # Options: none.
   def self.time(entity, options = {})
-
-
-
-    time = nil
-
-    silence_warnings do
-      time = ::Chronic.parse(s, {:guess => true})
-    end
-
-    if entity.has_parent? && remove_time_from_ancestors(entity, time)
-      nil
-    else
-      time
-    end
-
-  end
-
-  # Keeps the lowest-level time annotations that do
-  # not conflict with a higher time annotation.
-  # Returns true if the entity conflicts with a
-  # higher-level time annotation.
-  def self.remove_time_from_ancestors(entity, time)
-
-    entity.ancestors_with_type(:phrase).each do |a|
-
-      next if !a.has?(:time)
-      unless a.get(:time) == time
-        return true
-      end
-      a.unset(:time)
-
-    end
-
-    false
-
+    options = DefaultOptions.merge(options)
+    time = ::Chronic.parse(entity.to_s, options)
+    time ? DateTime.parse(time.to_s) : nil
   end
 
 end
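With this rewrite, the options hash is merged over DefaultOptions and handed straight to Chronic.parse, and the result is normalized to a DateTime (or nil). A small sketch, assuming any option accepted by Chronic.parse can be forwarded this way:

  Treat::Workers::Extractors::Time::Chronic.time('next tuesday at 3pm')
  # => DateTime instance, or nil when Chronic finds no date

  # Chronic's own options pass through, e.g. resolving bare weekdays
  # into the past rather than the future:
  Treat::Workers::Extractors::Time::Chronic.time('tuesday', context: :past)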
data/lib/treat/workers/extractors/time/kronic.rb ADDED

@@ -0,0 +1,20 @@
+# Time/date extraction using a simple rule-based library.
+#
+# Supported formats: Today, yesterday, tomorrow,
+# last thursday, this thursday, 14 Sep, 14 June 2010.
+# Any dates without a year are assumed to be in the past.
+class Treat::Workers::Extractors::Time::Kronic
+
+  require 'kronic'
+  require 'date'
+
+  # Return the date information contained within
+  # the entity by parsing it with the 'chronic' gem.
+  #
+  # Options: none.
+  def self.time(entity, options = {})
+    time = Kronic.parse(entity.to_s)
+    time.is_a?(DateTime) ? time : nil
+  end
+
+end
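A sketch of calling the new Kronic-backed worker directly; per the guard above, the parsed value is returned only when Kronic yields a DateTime, and anything unrecognized comes back as nil:

  Treat::Workers::Extractors::Time::Kronic.time('last thursday')
  Treat::Workers::Extractors::Time::Kronic.time('not a date')   # => nil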
data/lib/treat/workers/extractors/time/nickel.rb CHANGED

@@ -47,7 +47,6 @@ class Treat::Workers::Extractors::Time::Nickel
       occ.interval : :none
       time_recurrence_interval = interval
 
-
       s = [occ.start_date, occ.start_time]
       ds = [s[0].year, s[0].month, s[0].day] if s[0]
       ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
@@ -77,18 +76,4 @@ class Treat::Workers::Extractors::Time::Nickel
 
   end
 
-  # Keeps the lowest-level time annotations that do
-  # not conflict with a higher time annotation.
-  # Returns true if the entity conflicts with a
-  # higher-level time annotation.
-  def self.remove_time_from_ancestors(entity, time)
-    entity.ancestors_with_type(:phrase).each do |a|
-      next if !a.has?(:time)
-      return false unless a.get(:time).to_s == time.to_s
-      a.unset(:time, :time_recurrence,
-        :time_recurrence_interval, :end_time)
-    end
-    true
-  end
-
 end
data/lib/treat/workers/extractors/time/ruby.rb CHANGED

@@ -2,7 +2,7 @@
 # DateTime.parse() method.
 class Treat::Workers::Extractors::Time::Ruby
 
-
+
   require 'date'
 
   # Return a DateTime object representing the date/time
@@ -13,42 +13,11 @@ class Treat::Workers::Extractors::Time::Ruby
   #
   # Options: none.
   def self.time(entity, options = {})
-    s = entity.to_s
-    return if s =~ /^[0-9]+$/
     begin
-
-      if entity.has_parent? &&
-        remove_time_from_ancestors(entity, time)
-        nil
-      else
-        time
-      end
+      DateTime.parse(entity.to_s)
     rescue
       nil
     end
   end
 
-
-  # Keeps the lowest-level time annotations that do
-  # not conflict with a higher time annotation.
-  # Returns true if the entity conflicts with a
-  # higher-level time annotation.
-  def self.remove_time_from_ancestors(entity, time)
-
-    entity.ancestors_with_type(:phrase).each do |a|
-
-      next if !a.has?(:time)
-
-      unless a.get(:time) == time
-        return true
-      end
-
-      a.unset(:time)
-
-    end
-
-    false
-
-  end
-
 end
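After this simplification the worker is a thin wrapper around DateTime.parse with a rescue, so any string DateTime.parse understands yields a DateTime and anything else yields nil:

  Treat::Workers::Extractors::Time::Ruby.time('2012-06-14')    # => DateTime
  Treat::Workers::Extractors::Time::Ruby.time('no date here')  # => nil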
data/lib/treat/workers/lexicalizers/taggers/stanford.rb CHANGED

@@ -1,10 +1,10 @@
-# POS tagging using
-#
-#
-#
-# words, (iii)
-#
-# unknown word features.
+# POS tagging using a maximum entropy model, with (i)
+# explicit use of both preceding and following tag
+# contexts via a dependency network representation,
+# (ii) broad use of lexical features, including jointly
+# conditioning on multiple consecutive words, (iii)
+# effective use of priors in conditional loglinear models,
+# and (iv) fine-grained modeling of unknown word features.
 #
 # Original paper: Toutanova, Manning, Klein and Singer.
 # 2003. Feature-Rich Part-of-Speech Tagging with a
@@ -21,9 +21,6 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
     :tagger_model => nil
   }
 
-  # Shortcut for gem config.
-  Config = StanfordCoreNLP::Config
-
   # Tag the word using one of the Stanford taggers.
   def self.tag(entity, options = {})
 
@@ -64,6 +61,10 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
   def self.init_tagger(language)
     unless @@taggers[language]
       Treat::Loaders::Stanford.load(language)
+      unless StanfordCoreNLP.const_defined?('MaxentTagger')
+        StanfordCoreNLP.load_class('MaxentTagger',
+          'edu.stanford.nlp.tagger.maxent')
+      end
       model = Treat::Loaders::Stanford.find_model(:pos,language)
       tagger = StanfordCoreNLP::MaxentTagger.new(model)
       @@taggers[language] = tagger
data/lib/treat/workers/processors/parsers/stanford.rb CHANGED

@@ -1,140 +1,88 @@
-# Parsing using an interface to a Java implementation
-# of probabilistic natural language parsers, both
-# optimized PCFG and lexicalized dependency parsers,
-# and a lexicalized PCFG parser.
-#
-# Original paper: Dan Klein and Christopher D.
-# Manning. 2003. Accurate Unlexicalized Parsing.
-# Proceedings of the 41st Meeting of the Association
+# Parsing using an interface to a Java implementation
+# of probabilistic natural language parsers, both
+# optimized PCFG and lexicalized dependency parsers,
+# and a lexicalized PCFG parser.
+#
+# Original paper: Dan Klein and Christopher D.
+# Manning. 2003. Accurate Unlexicalized Parsing.
+# Proceedings of the 41st Meeting of the Association
 # for Computational Linguistics, pp. 423-430.
 class Treat::Workers::Processors::Parsers::Stanford
-
+
   Pttc = Treat.tags.aligned.phrase_tags_to_category
-
+
   # Hold one instance of the pipeline per language.
   @@parsers = {}
 
-  DefaultOptions = {
-    :parser_model => nil,
-    :tagger_model => nil
-  }
+  DefaultOptions = { model: nil }
 
   # Parse the entity using the Stanford parser.
-  #
-  # Options:
-  #
-  # - (Boolean) :silent => whether to silence the output
-  # of the JVM.
-  # - (String) :log_file => a filename to log output to
-  # instead of displaying it.
   def self.parse(entity, options = {})
 
-    val, lang = entity.to_s, entity.language
-
-
-    entity.check_hasnt_children
+    val, lang = entity.to_s, entity.language.intern
+
+    Treat::Loaders::Stanford.load(lang)
 
     tag_set = StanfordCoreNLP::Config::TagSets[lang]
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    end
-
+    list = get_token_list(entity)
+    entity.remove_all!
+
+    model_file = options[:model] ||
+      StanfordCoreNLP::Config::Models[:parse][lang]
+
+    unless @@parsers[lang] && @@parsers[lang][model_file]
+      model_path = Treat.libraries.stanford.model_path ||
+        StanfordCoreNLP.model_path
+      model_folder = StanfordCoreNLP::Config::ModelFolders[:parse]
+      model = File.join(model_path, model_folder, model_file)
+      @@parsers[lang] ||= {}
+      options = StanfordCoreNLP::Options.new
+      parser = StanfordCoreNLP::LexicalizedParser
+        .getParserFromFile(model, options)
+      @@parsers[lang][model_file] = parser
     end
-
-    entity.set :tag_set, tag_set
 
-
+    parser = @@parsers[lang][model_file]
+
+    text = parser.apply(list)
+
+    recurse(text.children[0], entity, tag_set)
+    entity.set :tag_set, tag_set
 
-  def self.init(lang, options)
-    Treat::Loaders::Stanford.load(lang)
-    options = DefaultOptions.merge(options)
-    StanfordCoreNLP.use(lang.intern)
-    if options[:tagger_model]
-      StanfordCoreNLP.set_model('pos.model', options[:tagger_model])
-    end
-    if options[:parser_model]
-      StanfordCoreNLP.set_model('parser.model', options[:parser_model])
-    end
-    annotators = [:tokenize, :ssplit, :pos, :lemma, :parse]
-    @@parsers[lang] = StanfordCoreNLP.load(*annotators)
   end
 
-
-
-
-
-    if java_node.num_children == 0
-
-      label = java_node.label
-      tag = label.get(:part_of_speech).to_s
-      tag_s, tag_opt = *tag.split('-')
-      tag_s ||= ''
-      ruby_node.value = java_node.value.to_s.strip
-      ruby_node.set :tag, tag_s
-      ruby_node.set :tag_opt, tag_opt if tag_opt
-      ruby_node.set :lemma, label.get(:lemma).to_s
-
-      additional_tags.each do |t|
-        lt = label.get(t)
-        ruby_node.set t, lt.to_s if lt
-      end
-
-      ruby_node
-
-    else
-
-      if java_node.num_children == 1 &&
-        java_node.children[0].num_children == 0
-        recurse(java_node.children[0],
-          ruby_node, tag_set, additional_tags)
-        return
-      end
+  def self.recurse(java_node, ruby_node, tag_set)
+
+    java_node.children.each do |java_child|
 
-
-
-      label = java_child.label
-      tag = label.get(:category).to_s
-      tag_s, tag_opt = *tag.split('-')
-      tag_s ||= ''
-
-      if Pttc[tag_s] && Pttc[tag_s][tag_set]
-        ruby_child = Treat::Entities::Phrase.new
-      else
-        l = java_child.children[0].to_s
-        v = java_child.children[0].value.to_s.strip
-
-        # Mhmhmhmhmhm FIX!
-        val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
-        ruby_child = Treat::Entities::Token.from_string(val)
-      end
+      label = java_child.label
+      tag = label.get(:category).to_s
 
-
-      ruby_child
+      if Pttc[tag] && Pttc[tag][tag_set]
+        ruby_child = Treat::Entities::Phrase.new
+        ruby_child.set :tag, tag
         ruby_node << ruby_child
-
         unless java_child.children.empty?
-          recurse(java_child, ruby_child, tag_set
+          recurse(java_child, ruby_child, tag_set)
         end
-
+      else
+        val = java_child.children[0].to_s
+        ruby_child = Treat::Entities::Token.from_string(val)
+        ruby_child.set :tag, tag
+        ruby_node << ruby_child
       end
-
+
     end
 
   end
-
+
+  def self.get_token_list(entity)
+    list = StanfordCoreNLP::ArrayList.new
+    entity.tokens.each do |token|
+      list.add(StanfordCoreNLP::Word.new(token.to_s))
+    end
+    list
+  end
+
 end
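The rewritten parser worker now builds its own token list (get_token_list), caches one LexicalizedParser per language and model file, and rebuilds the Treat tree through recurse. A hedged usage sketch; the sentence/tokenize DSL calls and the model filename below are assumptions, not taken from this diff:

  include Treat::Core::DSL

  s = sentence('The parser builds a phrase-structure tree.')
  s.tokenize  # get_token_list walks entity.tokens, so tokenize first
  Treat::Workers::Processors::Parsers::Stanford.parse(s)

  # A specific grammar inside the Stanford model folder can be chosen
  # with :model (hypothetical filename):
  # Treat::Workers::Processors::Parsers::Stanford.parse(s, model: 'englishPCFG.ser.gz')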