treat 0.1.4 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +4 -4
- data/TODO +21 -54
- data/lib/economist/half_cocked_basel.txt +16 -0
- data/lib/economist/hose_and_dry.doc +0 -0
- data/lib/economist/hungarys_troubles.abw +70 -0
- data/lib/economist/republican_nomination.pdf +0 -0
- data/lib/economist/saving_the_euro.odt +0 -0
- data/lib/economist/to_infinity_and_beyond.txt +15 -0
- data/lib/economist/zero_sum.html +91 -0
- data/lib/treat.rb +58 -72
- data/lib/treat/buildable.rb +59 -15
- data/lib/treat/categories.rb +26 -14
- data/lib/treat/category.rb +2 -2
- data/lib/treat/delegatable.rb +65 -48
- data/lib/treat/doable.rb +44 -0
- data/lib/treat/entities.rb +34 -14
- data/lib/treat/entities/collection.rb +2 -0
- data/lib/treat/entities/document.rb +3 -2
- data/lib/treat/entities/entity.rb +105 -90
- data/lib/treat/entities/phrases.rb +17 -0
- data/lib/treat/entities/tokens.rb +28 -13
- data/lib/treat/entities/zones.rb +20 -0
- data/lib/treat/extractors.rb +49 -11
- data/lib/treat/extractors/coreferences/stanford.rb +68 -0
- data/lib/treat/extractors/date/chronic.rb +32 -0
- data/lib/treat/extractors/date/ruby.rb +25 -0
- data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
- data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
- data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
- data/lib/treat/extractors/language/what_language.rb +49 -0
- data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
- data/lib/treat/extractors/roles/naive.rb +73 -0
- data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
- data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
- data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
- data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
- data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
- data/lib/treat/extractors/time/nickel.rb +30 -12
- data/lib/treat/extractors/topic_words/lda.rb +9 -9
- data/lib/treat/extractors/topics/reuters.rb +14 -15
- data/lib/treat/extractors/topics/reuters/region.xml +1 -0
- data/lib/treat/features.rb +7 -0
- data/lib/treat/formatters/readers/abw.rb +6 -1
- data/lib/treat/formatters/readers/autoselect.rb +5 -6
- data/lib/treat/formatters/readers/doc.rb +3 -1
- data/lib/treat/formatters/readers/html.rb +1 -1
- data/lib/treat/formatters/readers/image.rb +43 -0
- data/lib/treat/formatters/readers/odt.rb +1 -2
- data/lib/treat/formatters/readers/pdf.rb +9 -1
- data/lib/treat/formatters/readers/xml.rb +40 -0
- data/lib/treat/formatters/serializers/xml.rb +50 -14
- data/lib/treat/formatters/serializers/yaml.rb +7 -2
- data/lib/treat/formatters/unserializers/xml.rb +33 -7
- data/lib/treat/formatters/visualizers/dot.rb +90 -20
- data/lib/treat/formatters/visualizers/short_value.rb +2 -2
- data/lib/treat/formatters/visualizers/standoff.rb +2 -2
- data/lib/treat/formatters/visualizers/tree.rb +1 -1
- data/lib/treat/formatters/visualizers/txt.rb +13 -4
- data/lib/treat/group.rb +16 -10
- data/lib/treat/helpers/linguistics_loader.rb +18 -0
- data/lib/treat/inflectors.rb +10 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
- data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
- data/lib/treat/inflectors/declensions/english.rb +319 -0
- data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
- data/lib/treat/install.rb +59 -0
- data/lib/treat/kernel.rb +18 -8
- data/lib/treat/languages.rb +18 -11
- data/lib/treat/languages/arabic.rb +4 -2
- data/lib/treat/languages/chinese.rb +6 -2
- data/lib/treat/languages/dutch.rb +16 -0
- data/lib/treat/languages/english.rb +47 -19
- data/lib/treat/languages/french.rb +8 -5
- data/lib/treat/languages/german.rb +9 -6
- data/lib/treat/languages/greek.rb +16 -0
- data/lib/treat/languages/italian.rb +6 -3
- data/lib/treat/languages/polish.rb +16 -0
- data/lib/treat/languages/portuguese.rb +16 -0
- data/lib/treat/languages/russian.rb +16 -0
- data/lib/treat/languages/spanish.rb +16 -0
- data/lib/treat/languages/swedish.rb +16 -0
- data/lib/treat/languages/tags.rb +377 -0
- data/lib/treat/lexicalizers.rb +34 -23
- data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
- data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
- data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
- data/lib/treat/lexicalizers/tag/brill.rb +35 -40
- data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
- data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
- data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
- data/lib/treat/processors.rb +8 -8
- data/lib/treat/processors/chunkers/txt.rb +4 -4
- data/lib/treat/processors/parsers/enju.rb +114 -99
- data/lib/treat/processors/parsers/stanford.rb +109 -41
- data/lib/treat/processors/segmenters/punkt.rb +17 -18
- data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
- data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
- data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
- data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
- data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
- data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
- data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
- data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
- data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
- data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
- data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
- data/lib/treat/processors/segmenters/stanford.rb +38 -37
- data/lib/treat/processors/segmenters/tactful.rb +5 -4
- data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
- data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
- data/lib/treat/processors/tokenizers/perl.rb +2 -2
- data/lib/treat/processors/tokenizers/punkt.rb +6 -2
- data/lib/treat/processors/tokenizers/stanford.rb +25 -24
- data/lib/treat/processors/tokenizers/tactful.rb +1 -2
- data/lib/treat/proxies.rb +2 -35
- data/lib/treat/registrable.rb +17 -22
- data/lib/treat/sugar.rb +11 -11
- data/lib/treat/tree.rb +27 -17
- data/lib/treat/viewable.rb +29 -0
- data/lib/treat/visitable.rb +1 -1
- data/test/tc_entity.rb +56 -49
- data/test/tc_extractors.rb +41 -18
- data/test/tc_formatters.rb +7 -8
- data/test/tc_inflectors.rb +19 -24
- data/test/tc_lexicalizers.rb +12 -19
- data/test/tc_processors.rb +26 -12
- data/test/tc_resources.rb +2 -7
- data/test/tc_treat.rb +20 -22
- data/test/tc_tree.rb +4 -4
- data/test/tests.rb +3 -5
- data/test/texts.rb +13 -14
- data/tmp/INFO +1 -0
- metadata +78 -158
- data/bin/INFO +0 -1
- data/examples/benchmark.rb +0 -81
- data/examples/keywords.rb +0 -148
- data/lib/treat/detectors.rb +0 -31
- data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
- data/lib/treat/detectors/format/file.rb +0 -36
- data/lib/treat/detectors/language/what_language.rb +0 -29
- data/lib/treat/entities/constituents.rb +0 -15
- data/lib/treat/entities/sentence.rb +0 -8
- data/lib/treat/extractors/named_entity/abner.rb +0 -20
- data/lib/treat/extractors/named_entity/stanford.rb +0 -174
- data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
- data/lib/treat/extractors/time/chronic.rb +0 -20
- data/lib/treat/extractors/time/native.rb +0 -18
- data/lib/treat/formatters/readers/gocr.rb +0 -26
- data/lib/treat/formatters/readers/ocropus.rb +0 -31
- data/lib/treat/formatters/visualizers/html.rb +0 -13
- data/lib/treat/formatters/visualizers/inspect.rb +0 -20
- data/lib/treat/inflectors/declensions/en.rb +0 -18
- data/lib/treat/languages/categories.rb +0 -5
- data/lib/treat/languages/english/categories.rb +0 -23
- data/lib/treat/languages/english/tags.rb +0 -352
- data/lib/treat/languages/xinhua.rb +0 -12
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
- data/lib/treat/string.rb +0 -5
- data/test/tc_detectors.rb +0 -26
data/lib/treat/extractors.rb
CHANGED
@@ -1,43 +1,81 @@
|
|
1
1
|
module Treat
  # Extractors extract specific information out of texts.
  #
  # Each nested module below is a group of workers: it extends Group and
  # then configures itself through the attributes Group provides (type,
  # targets, default, preprocessors). Group is defined elsewhere in the
  # library; the comments here only describe what this file assigns.
  module Extractors
    # Detects the language of an entity.
    module Language
      extend Group
      require 'treat/extractors/language/language_extractor.rb'
      self.type = :annotator
      self.targets = [:entity]
      self.default = :what_language
    end
    # Extracts the time of an object and annotates it
    # with specific information regarding time.
    module Time
      extend Group
      self.type = :annotator
      self.targets = [:phrase]
    end
    # Extracts the date of an object and annotates it
    # with specific information regarding the date.
    module Date
      extend Group
      self.type = :annotator
      self.targets = [:phrase]
    end
    # Extract the topic from a text.
    module Topics
      extend Group
      self.type = :annotator
      self.targets = [:document, :zone]
    end
    # Extract the keywords from a text.
    module Keywords
      extend Group
      self.type = :annotator
      self.targets = [:document, :zone]
    end
    # Extract the topic words from a text.
    module TopicWords
      extend Group
      self.type = :annotator
      self.targets = [:collection]
    end
    # Tag named entities in texts.
    module NamedEntityTag
      extend Group
      self.type = :annotator
      self.targets = [:phrase, :word]
    end
    # Resolve coreferences within a zone of text.
    module Coreferences
      extend Group
      self.type = :annotator
      self.targets = [:zone]
    end
    # This module should be moved out of here ASAP.
    module Statistics
      extend Group
      self.type = :annotator
      self.targets = [:word]
      self.default = :none
      # Each preprocessor forwards to entity.statistics with the name of
      # the statistic to compute.
      self.preprocessors = {
        # The second argument is passed on as the :parent option unless
        # the caller already supplied one in options.
        :frequency_in => lambda do |entity, worker, options|
          options = {:parent => worker}.merge(options)
          entity.statistics(:frequency_in, options)
        end,
        :tf_idf => lambda do |entity, worker, options|
          entity.statistics(:tf_idf, options)
        end,
        # Accepts both (entity, options) and (entity, worker, options).
        # Lambdas enforce arity, so the former two-parameter form raised
        # ArgumentError when invoked with the same three arguments as the
        # sibling preprocessors. The options hash is always the last
        # argument.
        # NOTE(review): the call site is not visible here - confirm which
        # arity it uses.
        :position_in => lambda do |entity, *args|
          options = args.last || {}
          entity.statistics(:position_in, options)
        end
      }
    end
    # Extract the roles (subject, object, main verb, patient, agent)
    # played by the constituents of a phrase.
    module Roles
      extend Group
      self.type = :annotator
      self.targets = [:phrase]
    end
    extend Treat::Category
  end
end
|
@@ -0,0 +1,68 @@
|
|
1
|
+
module Treat
  module Extractors
    module Coreferences
      # Coreference resolution backed by the 'stanford-core-nlp' gem.
      class Stanford
        require 'stanford-core-nlp'
        # The annotation pipeline; loaded on first use, then reused.
        @@pipeline = nil
        # Annotate the entity's text with the pipeline, rebuild the
        # entity's children as sentences and tokens, and link every token
        # to the other tokens of its coreference cluster.
        #
        # Any existing children of the entity are removed first (with a
        # warning), because the whole text is re-segmented here.
        #
        # Returns a Hash mapping consecutive integers (starting at 0) to
        # the Array of tokens of each cluster whose size is not one.
        #
        # Options: none are read.
        def self.coreferences(entity, options = {})
          if entity.has_children?
            warn "The Stanford Coreference Resolver currently requires " +
              "an unsegmented, untokenized block of text to work with. " +
              "Removing and replacing all children of '#{entity.short_value}'."
            entity.remove_all!
          end
          @@pipeline ||= ::StanfordCoreNLP.load(
            :tokenize, :ssplit, :pos,
            :lemma, :parse, :ner, :dcoref
          )
          annotated = ::StanfordCoreNLP::Text.new(entity.to_s)
          @@pipeline.annotate(annotated)

          # First pass: build sentences and tokens, grouping the tokens
          # by the cluster id reported by the pipeline.
          clusters = {}
          annotated.get(:sentences).each do |java_sentence|
            sentence = Treat::Entities::Sentence.
              from_string(java_sentence.get(:value).to_s, true)
            java_sentence.get(:tokens).each do |java_token|
              token = Treat::Entities::Token.
                from_string(java_token.value.to_s)
              ne_tag = java_token.get(:named_entity_tag).
                to_s.downcase
              cluster_id = java_token.get(:coref_cluster_id).to_s
              if cluster_id != ''
                (clusters[cluster_id] ||= []) << token
                # Temporary marker, removed again in the second pass.
                token.set :coref_cluster_id, cluster_id
              end
              # 'o' is the tag of tokens that are not named entities.
              token.set :named_entity_tag, ne_tag.intern unless ne_tag == 'o'
              sentence << token
            end
            entity << sentence
          end

          # Second pass: replace the temporary cluster id of each token
          # by links to the other members of its cluster.
          entity.each_token do |token|
            next unless token.has?(:coref_cluster_id)
            others = clusters[token.coref_cluster_id].dup
            others.delete(token)
            token.unset(:coref_cluster_id)
            next if others.empty?
            token.set :coreferents, others
            others.each { |target| token.link(target, :refers_to) }
          end

          # Renumber the clusters from 0, skipping one-token clusters.
          coreferences = {}
          clusters.each_value do |members|
            next if !members || members.size == 1
            coreferences[coreferences.size] = members
          end
          coreferences
        end
      end
    end
  end
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
module Treat
  module Extractors
    module Date
      # A wrapper for the 'chronic' gem, which parses
      # date information.
      #
      # Project website: http://chronic.rubyforge.org/
      class Chronic
        silence_warnings { require 'chronic' }
        require 'date'
        # Return the date information contained within the entity
        # by parsing it with the 'chronic' gem.
        #
        # Returns nil without parsing when the entity already has a
        # :time feature, and nil when the text cannot be parsed.
        # Otherwise returns the result of calling to_date on the parsed
        # value. As a side effect, any :date feature set on the entity's
        # phrase ancestors is removed.
        #
        # Options: none.
        def self.date(entity, options = {})
          return if entity.has?(:time)
          # Build the cleaned-up text with non-destructive calls so the
          # String returned by entity.to_s (which may be the entity's own
          # value) is never modified in place.
          s = entity.to_s.gsub('\/', '/').strip
          date = nil
          silence_warnings do
            date = ::Chronic.parse(s, {:guess => true})
          end
          entity.ancestors_with_type(:phrase).each do |a|
            a.unset(:date) if a.has?(:date)
          end
          date.to_date if date
        end
      end
    end
  end
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
module Treat
  module Extractors
    module Date
      # A wrapper for Ruby's native date parsing.
      class Ruby
        require 'date'
        # Return a Date object representing the date contained
        # within the entity, using Ruby's native date/time parser,
        # or nil when the text cannot be parsed as a date.
        #
        # Surrounding whitespace is ignored and escaped slashes
        # ('\/') are treated as plain slashes.
        #
        # Options: none.
        def self.date(entity, options = {})
          s = entity.to_s.strip.gsub('\/', '/')
          # ::DateTime (not Date) because Date resolves to this module
          # here, and because DateTime also accepts time-only strings.
          ::DateTime.parse(s).to_date
        rescue ArgumentError
          # Unparseable input is the only failure treated as "no date";
          # anything else is a programming error and should surface.
          nil
        end
      end
    end
  end
end
|
@@ -0,0 +1,26 @@
|
|
1
|
+
module Treat
  module Extractors
    module Keywords
      # Selects as keywords the words of an entity that have
      # the highest TF*IDF scores.
      class TfIdf
        DefaultOptions = { num_keywords: 5 }
        # Return an Array with the values of the words of the entity
        # that have the highest TF*IDF scores, best first. Each distinct
        # word value is listed once.
        #
        # Options:
        # - (Integer) :num_keywords => the maximum number of
        # keywords to return (default: 5).
        def self.keywords(entity, options = {})
          options = DefaultOptions.merge(options)
          tf_idfs = {}
          entity.each_word do |word|
            # Score each distinct word value only once.
            tf_idfs[word.value] ||= word.tf_idf
          end
          ranked = tf_idfs.sort_by { |_, score| score }.reverse
          # Always return at most :num_keywords plain word values. The
          # previous loop broke on i > num_keywords and so returned one
          # keyword too many, and short inputs came back as
          # [value, score] pairs instead of values.
          limit = [options[:num_keywords], 0].max
          ranked.first(limit).map { |value, _| value }
        end
      end
    end
  end
end
|
@@ -1,12 +1,12 @@
|
|
1
1
|
module Treat
|
2
2
|
module Extractors
|
3
3
|
module Keywords
|
4
|
-
class
|
5
|
-
DefaultOptions = {tf_idf_threshold:
|
4
|
+
class TopicsTfIdf
|
5
|
+
DefaultOptions = {num_keywords: 5, tf_idf_threshold: 0.5, topic_words: nil}
|
6
6
|
def self.keywords(entity, options = {})
|
7
7
|
options = DefaultOptions.merge(options)
|
8
8
|
unless options[:topic_words]
|
9
|
-
|
9
|
+
options[:topic_words] = entity.parent_collection.topic_words
|
10
10
|
end
|
11
11
|
if Treat::Entities.rank(entity.type) <
|
12
12
|
Treat::Entities.rank(:sentence)
|
@@ -20,21 +20,29 @@ module Treat
|
|
20
20
|
keywords = []
|
21
21
|
entity.each_word do |word|
|
22
22
|
found = false
|
23
|
+
tf_idf = word.tf_idf
|
23
24
|
options[:topic_words].each do |i, topic_words|
|
24
25
|
next if keywords.include?(word.value)
|
25
26
|
if topic_words.include?(word.value)
|
26
27
|
found = true
|
27
|
-
tf_idf
|
28
|
-
if tf_idf < options[:tf_idf_threshold]
|
28
|
+
if tf_idf > options[:tf_idf_threshold]
|
29
29
|
keywords << word.value
|
30
30
|
word.set :is_keyword?, found
|
31
31
|
end
|
32
32
|
end
|
33
33
|
end
|
34
34
|
end
|
35
|
-
|
35
|
+
i = 0
|
36
|
+
# Copy the leading keywords, stopping once the counter i exceeds options[:num_keywords].
|
37
|
+
selected_keywords = []
|
38
|
+
keywords.each do |keyword|
|
39
|
+
break if i > options[:num_keywords]
|
40
|
+
selected_keywords << keyword
|
41
|
+
i += 1
|
42
|
+
end
|
43
|
+
selected_keywords
|
36
44
|
end
|
37
45
|
end
|
38
46
|
end
|
39
47
|
end
|
40
|
-
end
|
48
|
+
end
|
data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb}
RENAMED
@@ -1,12 +1,15 @@
|
|
1
1
|
module Treat
|
2
|
-
module
|
2
|
+
module Extractors
|
3
3
|
module Language
|
4
4
|
# A generic language detector, which is called before
|
5
5
|
# any language detector and ensures that configuration
|
6
6
|
# options concerning language are enforced (e.g. returns
|
7
7
|
# the default language when Treat.detect_language is false).
|
8
|
-
class
|
8
|
+
class LanguageExtractor
|
9
9
|
def self.language(entity, options = {})
|
10
|
+
if entity.to_s =~ /^[[:digit:]]+$/
|
11
|
+
return Treat.default_language
|
12
|
+
end
|
10
13
|
if Treat.detect_language == false
|
11
14
|
return Treat.default_language
|
12
15
|
else
|
@@ -0,0 +1,49 @@
|
|
1
|
+
module Treat
  module Extractors
    module Language
      # Load the 'whatlanguage' gem with warnings silenced.
      silence_warnings { require 'whatlanguage' }
      # Remove String#language.
      # NOTE(review): presumably the gem defines it and it would clash
      # with the entity API - confirm.
      String.class_eval { undef :language }
      DefaultOptions = {
        :bias => [:eng, :fre, :chi, :ger, :ara, :spa]
      }
      # Adaptor for the 'whatlanguage' gem, which
      # performs probabilistic language detection.
      # The library works by checking for the presence
      # of words with bloom filters built from dictionaries
      # based upon each source language.
      class WhatLanguage < LanguageExtractor
        # Keep only one instance of the gem class.
        @@detector = nil
        # Detect the language of an entity using the
        # 'whatlanguage' gem. Return an identifier
        # corresponding to the ISO-639-2 code for the
        # language.
        #
        # Whatever the generic LanguageExtractor returns takes
        # precedence when it is truthy; the gem is only consulted
        # otherwise.
        #
        # Options:
        # - (Array of Symbols) bias => Languages to bias
        # toward when more than one language is detected
        # with equal probability.
        def self.language(entity, options = {})
          options = DefaultOptions.merge(options)
          preset = super(entity, options)
          return preset if preset
          @@detector ||= ::WhatLanguage.new(:possibilities)
          # Re-key the detector's scores by language code.
          scores = {}
          @@detector.process_text(entity.to_s).each do |name, score|
            scores[Treat::Languages.code(name)] = score
          end
          best = scores.values.max
          candidates = scores.select { |_, score| score == best }.keys
          # Among the top-scoring languages, prefer the first one
          # that is listed in the bias option.
          candidates.each do |code|
            return code if options[:bias].include?(code)
          end
          candidates.first
        end
      end
    end
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
module Treat
  module Extractors
    module NamedEntityTag
      # Named entity tagging backed by the 'stanford-core-nlp' gem.
      class Stanford
        require 'stanford-core-nlp'
        StanfordCoreNLP.load_class('ArrayList', 'java.util')
        StanfordCoreNLP.load_class('Word', 'edu.stanford.nlp.ling')
        # The annotation pipeline; loaded on first use, then reused.
        @@pipeline = nil
        # Run the pipeline over the entity and set a :named_entity_tag
        # feature (a lowercase Symbol) wherever the pipeline reports a
        # tag other than 'o'.
        #
        # - For a phrase, a new token is created for every token the
        #   pipeline finds, tagged, and appended to the entity.
        # - For a token, only the entity itself is tagged, using the
        #   first token the pipeline returns. When the token has a
        #   parent, the tokens of its parent phrase are what is handed
        #   to the pipeline.
        #
        # Options: none are read.
        def self.named_entity_tag(entity, options = {})
          phrase = nil
          if entity.is_a?(Treat::Entities::Token) && entity.has_parent?
            phrase = entity.parent_phrase
            input = get_list(phrase.tokens)
          else
            input = entity.to_s
          end

          @@pipeline ||= ::StanfordCoreNLP.load(
            :tokenize, :ssplit, :pos, :lemma, :parse, :ner
          )

          annotated = ::StanfordCoreNLP::Text.new(input)
          @@pipeline.annotate(annotated)

          container = phrase || entity

          if entity.is_a?(Treat::Entities::Phrase)
            annotated.get(:tokens).each do |java_token|
              token = Treat::Entities::Token.from_string(java_token.value.to_s)
              ne_tag = java_token.get(:named_entity_tag).to_s.downcase
              token.set :named_entity_tag, ne_tag.intern unless ne_tag == 'o'
              container << token
            end
          elsif entity.is_a?(Treat::Entities::Token)
            ne_tag = annotated.get(:tokens).iterator.next.
              get(:named_entity_tag).to_s.downcase
            entity.set :named_entity_tag, ne_tag.intern unless ne_tag == 'o'
          end
        end

        # Copy the string form of each word into a new java.util.ArrayList
        # of edu.stanford.nlp.ling.Word objects and return the list.
        def self.get_list(words)
          java_words = StanfordCoreNLP::ArrayList.new
          words.each do |word|
            java_words.add(StanfordCoreNLP::Word.new(word.to_s))
          end
          java_words
        end
      end
    end
  end
end
|
@@ -0,0 +1,73 @@
|
|
1
|
+
module Treat
  module Extractors
    module Roles
      # A naive role extractor that works from the main verb of an
      # entity and from that verb's dependencies.
      class Naive
        # Find the main verb of the entity, then its subject and object,
        # and derive the patient and the agent from the verb's voice.
        # The constituents found are flagged (:is_patient?, :is_agent?,
        # ...) and, when both exist, the agent and patient are linked to
        # each other.
        #
        # Returns a Treat::Features::Roles built from (subject, object,
        # verb, patient, agent); an empty one when there is no main verb
        # or the main verb has no :voice feature.
        def self.roles(entity, options = {})
          v = main_verb(entity, options)
          return Treat::Features::Roles.new unless v && v.has?(:voice)
          o = object(v, options)
          s = subject(v, options)
          # Active voice: the object undergoes the action. Passive voice
          # (or an auxiliary verb): the subject does.
          patient =
            if v.voice == 'active'
              o
            elsif v.voice == 'passive' || v.has_feature?(:aux)
              s
            end
          patient.set :is_patient?, true if patient
          # No agent is extracted for the passive voice.
          agent = v.voice == 'active' ? s : nil
          agent.set :is_agent?, true if agent
          if agent && patient
            agent.link(patient, :agent_of)
            patient.link(agent, :patient_of)
          end
          Treat::Features::Roles.new(s, o, v, patient, agent)
        end
        # Return the subject of the sentence|verb: the target of the
        # verb's first dependency, flagged with :is_subject?. Returns
        # nil when there is no verb or no such dependency.
        def self.subject(verb, options)
          return unless verb
          s = dependency_targets(verb)[0]
          s.set :is_subject?, true if s
          s
        end
        # Return the object of the sentence|verb: the target of the
        # verb's second dependency if it is tagged 'NP', otherwise the
        # first 'NP' phrase found inside it, flagged with :is_object?.
        # Returns nil when there is no verb, for verbs in the passive
        # voice, or when no object is found.
        def self.object(verb, options)
          return unless verb
          return if verb.has?(:voice) && verb.voice == 'passive'
          o = dependency_targets(verb)[1]
          return unless o
          b = o.tag == 'NP' ? o : o.phrases_with_tag('NP')[0]
          b.set :is_object?, true if b
          b
        end
        # Find the main verb (shallowest verb in the tree) and flag it
        # with :is_main_verb?. Returns nil when the entity has no verbs.
        def self.main_verb(entity, options)
          # min_by picks the shallowest verb without reordering the
          # array returned by entity.verbs.
          v = entity.verbs.min_by { |verb| verb.depth }
          v.set :is_main_verb?, true if v
          v
        end
        # Resolve the target of each dependency of the verb by looking
        # it up from the root of the verb's tree, in dependency order.
        def self.dependency_targets(verb)
          verb.dependencies.map do |dependency|
            verb.root.find(dependency.target)
          end
        end
        private_class_method :dependency_targets
      end
    end
  end
end
|