treat 0.1.4 → 0.2.0

Files changed (160)
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26

data/lib/treat/extractors.rb
@@ -1,43 +1,81 @@
 module Treat
   # Extractors extract specific information out of texts.
   module Extractors
+    # Detecs language.
+    module Language
+      extend Group
+      require 'treat/extractors/language/language_extractor.rb'
+      self.type = :annotator
+      self.targets = [:entity]
+      self.default = :what_language
+    end
     # Extracts the time of an object and annotates it
     # with specific information regarding time.
     module Time
       extend Group
       self.type = :annotator
-      self.targets = [:sentence, :word, :constituent, :symbol]
+      self.targets = [:phrase]
+    end
+    # Extracts the time of an object and annotates it
+    # with specific information regarding time.
+    module Date
+      extend Group
+      self.type = :annotator
+      self.targets = [:phrase]
     end
     # Extract the topic from a text.
     module Topics
       extend Group
       self.type = :annotator
-      self.targets = [:collection, :document, :zone, :sentence]
+      self.targets = [:document, :zone]
     end
-    # Extract the topic from a text.
+    # Extract the keywords from a text.
+    module Keywords
+      extend Group
+      self.type = :annotator
+      self.targets = [:document, :zone]
+    end
+    # Extract the topic words from a text.
     module TopicWords
       extend Group
       self.type = :annotator
-      self.targets = [:collection, :document, :zone, :sentence]
+      self.targets = [:collection]
     end
     # Extract named entities from texts.
-    module NamedEntity
+    module NamedEntityTag
       extend Group
-      self.type = :computer
-      self.targets = [:entity]
+      self.type = :annotator
+      self.targets = [:phrase, :word]
     end
-    # Extract the key sentences from a text.
-    module Keywords
+    # Extract named entities from texts.
+    module Coreferences
       extend Group
       self.type = :annotator
-      self.targets = [:collection, :document, :zone, :sentence]
+      self.targets = [:zone]
     end
     # This module should be moved out of here ASAP.
     module Statistics
       extend Group
       self.type = :annotator
-      self.targets = [:entity]
+      self.targets = [:word]
       self.default = :none
+      self.preprocessors = {
+        :frequency_in => lambda do |entity, worker, options|
+          options = {:parent => worker}.merge(options)
+          entity.statistics(:frequency_in, options)
+        end,
+        :tf_idf => lambda do |entity, worker, options|
+          entity.statistics(:tf_idf, options)
+        end,
+        :position_in => lambda do |entity, options|
+          entity.statistics(:position_in, options)
+        end
+      }
+    end
+    module Roles
+      extend Group
+      self.type = :annotator
+      self.targets = [:phrase]
     end
     extend Treat::Category
   end
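
Each Group above delegates a method named after the module (language, date, keywords, named_entity_tag, coreferences, statistics, roles) to its target entity types; a worker is chosen with a symbol and an options hash, which is the same calling convention the preprocessor lambdas above already use (entity.statistics(:tf_idf, options)). A minimal sketch of driving the new groups, assuming that delegation convention and an example sentence of our own:

    require 'treat'

    # Sentence.from_string is the same constructor the new extractors use internally.
    s = Treat::Entities::Sentence.from_string(
      'Angela Merkel will visit Paris on January 3rd, 2012.')

    s.language(:what_language)     # Extractors::Language::WhatLanguage
    s.date(:chronic)               # Extractors::Date::Chronic
    s.named_entity_tag(:stanford)  # Extractors::NamedEntityTag::Stanford (needs CoreNLP models)
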

data/lib/treat/extractors/coreferences/stanford.rb
@@ -0,0 +1,68 @@
+module Treat
+  module Extractors
+    module Coreferences
+      class Stanford
+        require 'stanford-core-nlp'
+        @@pipeline = nil
+        def self.coreferences(entity, options = {})
+          if entity.has_children?
+            warn "The Stanford Coreference Resolver currently requires " +
+                 "an unsegmented, untokenized block of text to work with. " +
+                 "Removing and replacing all children of '#{entity.short_value}'."
+            entity.remove_all!
+          end
+          @@pipeline ||= ::StanfordCoreNLP.load(
+            :tokenize, :ssplit, :pos,
+            :lemma, :parse, :ner, :dcoref
+          )
+          text = ::StanfordCoreNLP::Text.new(entity.to_s)
+          @@pipeline.annotate(text)
+          clusters = {}
+          text.get(:sentences).each do |sentence|
+            s = Treat::Entities::Sentence.
+                from_string(sentence.get(:value).to_s, true)
+            sentence.get(:tokens).each do |token|
+              t = Treat::Entities::Token.
+                  from_string(token.value.to_s)
+              tag = token.get(:named_entity_tag).
+                    to_s.downcase
+              corefid = token.get(:coref_cluster_id).to_s
+              unless corefid == ''
+                clusters[corefid] ||= []
+                clusters[corefid] << t
+                t.set :coref_cluster_id, corefid
+              end
+
+              t.set :named_entity_tag,
+                    tag.intern unless tag == 'o'
+              s << t
+            end
+            entity << s
+          end
+          entity.each_token do |token|
+            if token.has?(:coref_cluster_id)
+              id = token.coref_cluster_id
+              links = clusters[id].dup
+              links.delete(token)
+              token.unset(:coref_cluster_id)
+              next if links.empty?
+              token.set :coreferents, links
+              links.each do |target|
+                token.link(target, :refers_to)
+              end
+            end
+          end
+          i = 0
+          coreferences = {}
+          clusters.each do |k,v|
+            unless !v || v.size == 1
+              coreferences[i] = v
+              i += 1
+            end
+          end
+          coreferences
+        end
+      end
+    end
+  end
+end
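
This worker returns a hash of cluster id to coreferring tokens and, as a side effect, rebuilds the entity's sentences, sets :coreferents on every linked token, and adds :refers_to links between them. A rough usage sketch, assuming a zone such as a paragraph can be built from a string the same way the worker builds sentences, and that the stanford-core-nlp gem and its models are installed:

    para = Treat::Entities::Paragraph.from_string(
      'Angela Merkel visited Paris last week. She then met the French president.')

    clusters = para.coreferences(:stanford)
    # => e.g. { 0 => [<Token 'Merkel'>, <Token 'She'>] }   (illustrative output)

    para.each_token do |t|
      puts "#{t} ~ #{t.coreferents.map(&:to_s).join(', ')}" if t.has?(:coreferents)
    end
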

data/lib/treat/extractors/date/chronic.rb
@@ -0,0 +1,32 @@
+module Treat
+  module Extractors
+    module Date
+      # A wrapper for the 'chronic' gem, which parses
+      # date information.
+      #
+      # Project website: http://chronic.rubyforge.org/
+      class Chronic
+        silence_warnings { require 'chronic' }
+        require 'date'
+        # Return the date information contained within the entity
+        # by parsing it with the 'chronic' gem.
+        #
+        # Options: none.
+        def self.date(entity, options = {})
+          date = nil
+          return if entity.has?(:time)
+          s = entity.to_s
+          s.gsub!('\/', '/')
+          s.strip!
+          silence_warnings do
+            date = ::Chronic.parse(s, {:guess => true})
+          end
+          entity.ancestors_with_type(:phrase).each do |a|
+            a.unset(:date) if a.has?(:date)
+          end
+          return date.to_date if date
+        end
+      end
+    end
+  end
+end

data/lib/treat/extractors/date/ruby.rb
@@ -0,0 +1,25 @@
+module Treat
+  module Extractors
+    module Date
+      # A wrapper for Ruby's native date parsing.
+      class Ruby
+        require 'date'
+        # Return a DateTime object representing the date/date
+        # contained within the entity, using Ruby's native
+        # date/date parser.
+        #
+        # Options: none.
+        def self.date(entity, options = {})
+          begin
+            s = entity.to_s.strip
+            s.gsub!('\/', '/')
+            date = ::DateTime.parse(s)
+            date.to_date
+          rescue
+            nil
+          end
+        end
+      end
+    end
+  end
+end
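
Both date workers return a Ruby Date or nil. The chronic worker understands natural-language expressions, while the ruby worker only succeeds on strings DateTime.parse accepts and rescues everything else to nil. A small comparison with illustrative inputs (the sample strings are our own, not from the test suite):

    Treat::Entities::Sentence.from_string('January 3rd, 2012').date(:chronic)
    # => #<Date: 2012-01-03>  (chronic also handles 'next tuesday', '3 days ago', ...)

    Treat::Entities::Sentence.from_string('next tuesday').date(:ruby)
    # => nil; DateTime.parse only accepts literal formats, and the worker rescues to nil

    Treat::Entities::Sentence.from_string('2011-12-25').date(:ruby)
    # => #<Date: 2011-12-25>
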

data/lib/treat/extractors/keywords/tf_idf.rb
@@ -0,0 +1,26 @@
+module Treat
+  module Extractors
+    module Keywords
+      class TfIdf
+        DefaultOptions = { num_keywords: 5 }
+        def self.keywords(entity, options = {})
+          options = DefaultOptions.merge(options)
+          tf_idfs = {}
+          entity.each_word do |word|
+            tf_idfs[word.value] ||= word.tf_idf
+          end
+          tf_idfs = tf_idfs.sort_by {|k,v| v}.reverse
+          return tf_idfs if tf_idfs.size <= options[:num_keywords]
+          keywords = []
+          i = 0
+          tf_idfs.each do |info|
+            break if i > options[:num_keywords]
+            keywords << info[0]
+            i += 1
+          end
+          keywords
+        end
+      end
+    end
+  end
+end

data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb}
@@ -1,12 +1,12 @@
 module Treat
   module Extractors
     module Keywords
-      class TopicsFrequency
-        DefaultOptions = {tf_idf_threshold: 180, topic_words: nil}
+      class TopicsTfIdf
+        DefaultOptions = {num_keywords: 5, tf_idf_threshold: 0.5, topic_words: nil}
         def self.keywords(entity, options = {})
           options = DefaultOptions.merge(options)
           unless options[:topic_words]
-            raise Treat::Exception, "You must supply topic words."
+            options[:topic_words] = entity.parent_collection.topic_words
           end
           if Treat::Entities.rank(entity.type) <
             Treat::Entities.rank(:sentence)
@@ -20,21 +20,29 @@ module Treat
           keywords = []
           entity.each_word do |word|
             found = false
+            tf_idf = word.tf_idf
             options[:topic_words].each do |i, topic_words|
               next if keywords.include?(word.value)
               if topic_words.include?(word.value)
                 found = true
-                tf_idf = word.tf_idf
-                if tf_idf < options[:tf_idf_threshold]
+                if tf_idf > options[:tf_idf_threshold]
                   keywords << word.value
                   word.set :is_keyword?, found
                 end
               end
             end
           end
-          keywords
+          i = 0
+          # Take a slice of keywords with i elements.
+          selected_keywords = []
+          keywords.each do |keyword|
+            break if i > options[:num_keywords]
+            selected_keywords << keyword
+            i += 1
+          end
+          selected_keywords
         end
       end
     end
   end
-end
+end
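
The plain tf_idf worker ranks every word of the receiver by its tf*idf score (computed against the parent collection) and keeps the top num_keywords; topics_tf_idf additionally requires a word to appear among the collection's topic words (from the LDA-based topic_words extractor) and to exceed tf_idf_threshold, falling back to entity.parent_collection.topic_words when no :topic_words option is supplied. A sketch, assuming `collection` is a Treat::Entities::Collection already built from a folder of documents (such as the data/lib/economist samples) and that its documents are reachable as shown:

    doc = collection.documents.first

    doc.keywords(:tf_idf, :num_keywords => 10)
    # => up to ten words with the highest tf*idf scores in this document

    collection.topic_words(:lda)    # LDA topic words for the whole collection
    doc.keywords(:topics_tf_idf, :tf_idf_threshold => 0.5)
    # => keywords restricted to words that also belong to a topic
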

data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb}
@@ -1,12 +1,15 @@
 module Treat
-  module Detectors
+  module Extractors
     module Language
       # A generic language detector, which is called before
       # any language detector and ensures that configuration
       # options concerning language are enforced (e.g. returns
       # the default language when Treat.detect_language is false).
-      class LanguageDetector
+      class LanguageExtractor
         def self.language(entity, options = {})
+          if entity.to_s =~ /^[[:digit:]]+$/
+            return Treat.default_language
+          end
           if Treat.detect_language == false
             return Treat.default_language
           else

data/lib/treat/extractors/language/what_language.rb
@@ -0,0 +1,49 @@
+module Treat
+  module Extractors
+    module Language
+      # Require the 'whatlanguage' gem.
+      silence_warnings { require 'whatlanguage' }
+      String.class_eval { undef :language }
+      DefaultOptions = {
+        :bias => [:eng, :fre, :chi, :ger, :ara, :spa]
+      }
+      # Adaptor for the 'whatlanguage' gem, which
+      # performs probabilistic language detection.
+      # The library works by checking for the presence
+      # of words with bloom filters built from dictionaries
+      # based upon each source language.
+      class WhatLanguage < LanguageExtractor
+        # Keep only once instance of the gem class.
+        @@detector = nil
+        # Detect the language of an entity using the
+        # 'whatlanguage' gem. Return an identifier
+        # corresponding to the ISO-639-2 code for the
+        # language.
+        #
+        # Options:
+        # - (Array of Symbols) bias => Languages to bias
+        # toward when more than one language is detected
+        # with equal probability.
+        def self.language(entity, options = {})
+          options = DefaultOptions.merge(options)
+          predetection = super(entity, options)
+          return predetection if predetection
+          @@detector ||= ::WhatLanguage.new(:possibilities)
+          possibilities = @@detector.process_text(entity.to_s)
+          lang = {}
+          possibilities.each do |k,v|
+            lang[Treat::Languages.code(k)] = v
+          end
+          max = lang.values.max
+          ordered = lang.select { |i,j| j == max }.keys
+          ordered.each do |l|
+            if options[:bias].include?(l)
+              return l
+            end
+          end
+          return ordered.first
+        end
+      end
+    end
+  end
+end
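
The base LanguageExtractor short-circuits in two cases before WhatLanguage ever runs: purely numeric strings and Treat.detect_language == false both return Treat.default_language. When detection does run, ties between equally probable languages are broken in favour of the :bias list. A small sketch, assuming Treat.detect_language is a writable configuration flag as the base class implies:

    Treat.detect_language = true   # with false, every entity reports Treat.default_language

    s = Treat::Entities::Sentence.from_string('Der Vertrag von Lissabon ist in Kraft getreten.')
    s.language(:what_language)     # => :ger  (illustrative)

    Treat::Entities::Sentence.from_string('12345').language(:what_language)
    # => Treat.default_language; all-digit strings never reach the detector
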

data/lib/treat/extractors/named_entity_tag/stanford.rb
@@ -0,0 +1,53 @@
+module Treat
+  module Extractors
+    module NamedEntityTag
+      class Stanford
+        require 'stanford-core-nlp'
+        StanfordCoreNLP.load_class('ArrayList', 'java.util')
+        StanfordCoreNLP.load_class('Word', 'edu.stanford.nlp.ling')
+        @@pipeline = nil
+        def self.named_entity_tag(entity, options = {})
+          pp = nil
+          if entity.is_a?(Treat::Entities::Token) &&
+            entity.has_parent?
+            pp = entity.parent_phrase
+            s = get_list(pp.tokens)
+          else
+            s = entity.to_s
+          end
+
+          @@pipeline ||= ::StanfordCoreNLP.load(
+            :tokenize, :ssplit, :pos, :lemma, :parse, :ner
+          )
+
+          text = ::StanfordCoreNLP::Text.new(s)
+          @@pipeline.annotate(text)
+
+          add_to = pp ? pp : entity
+
+          if entity.is_a?(Treat::Entities::Phrase)
+            text.get(:tokens).each do |token|
+              t = Treat::Entities::Token.from_string(token.value.to_s)
+              tag = token.get(:named_entity_tag).to_s.downcase
+              t.set :named_entity_tag, tag.intern unless tag == 'o'
+              add_to << t
+            end
+          elsif entity.is_a?(Treat::Entities::Token)
+            tag = text.get(:tokens).iterator.next.
+                  get(:named_entity_tag).to_s.downcase
+            entity.set :named_entity_tag, tag.intern unless tag == 'o'
+          end
+
+        end
+
+        def self.get_list(words)
+          list = StanfordCoreNLP::ArrayList.new
+          words.each do |w|
+            list.add(StanfordCoreNLP::Word.new(w.to_s))
+          end
+          list
+        end
+      end
+    end
+  end
+end
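
Applied to a phrase, this worker runs the CoreNLP pipeline on the raw string and appends freshly built Token children carrying a :named_entity_tag feature (the neutral 'o' tag is dropped); applied to a single token, it tags that token in the context of its parent phrase. A sketch, assuming the stanford-core-nlp gem and its models are installed; the sample sentence and printed output are only illustrative:

    s = Treat::Entities::Sentence.from_string('Angela Merkel lives in Berlin.')
    s.named_entity_tag(:stanford)

    s.each_token do |t|
      puts "#{t}\t#{t.named_entity_tag}" if t.has?(:named_entity_tag)
    end
    # Angela   person
    # Merkel   person
    # Berlin   location
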

data/lib/treat/extractors/roles/naive.rb
@@ -0,0 +1,73 @@
+module Treat
+  module Extractors
+    module Roles
+      class Naive
+        def self.roles(entity, options = {})
+          v = main_verb(entity, options)
+          return Treat::Features::Roles.new unless (v && v.has?(:voice))
+          o = object(v, options)
+          s = subject(v, options)
+          if v.voice == 'active'
+            p = o
+          elsif v.voice == 'passive'
+            p = s
+          elsif v.has_feature?(:aux)
+            p = s
+          end
+          p.set :is_patient?, true if p
+          if v.voice == 'active'
+            a = s
+          elsif v.voice == 'passive'
+            #a = object(entity, options)
+          end
+          a.set :is_agent?, true if a
+          if a && p
+            a.link(p, :agent_of)
+            p.link(a, :patient_of)
+          end
+          # Fix - s, o, v
+          Treat::Features::Roles.new(s, o, v, p, a)
+        end
+        # Return the subject of the sentence|verb.
+        def self.subject(verb, options)
+          args = []
+          return unless verb
+          verb.dependencies.each do |dependency|
+            args << verb.root.find(dependency.target)
+          end
+          s = args[0]
+          s.set :is_subject?, true if s
+          s
+        end
+        # Return the object of the sentence|verb.
+        def self.object(verb, options)
+          return if verb.has?(:voice) && verb.voice == 'passive'
+          args = []
+          verb.dependencies.each do |dependency|
+            args << verb.root.find(dependency.target)
+          end
+          o = args[1]
+          return unless o
+          if o.tag == 'NP'
+            b = o
+          else
+            b = o.phrases_with_tag('NP')[0]
+          end
+          b.set :is_object?, true if b
+          b
+        end
+        # Find the main verb (shallowest verb in the tree).
+        def self.main_verb(entity, options)
+          verbs = entity.verbs
+          if verbs.size == 0
+            return
+          end
+          verbs.sort! { |a,b| a.depth <=> b.depth }
+          v = verbs[0]
+          v.set :is_main_verb?, true if v
+          v
+        end
+      end
+    end
+  end
+end
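
The naive role labeller expects the sentence to have been parsed already, so that its verbs carry a :voice feature and dependency links (as the Enju wrapper in data/lib/treat/processors/parsers/enju.rb provides); it then marks the shallowest verb as main verb and derives subject, object, agent and patient from its dependencies. A sketch under that assumption; the parser invocation and feature readers below follow the expected delegation names and are not verified against this release:

    s = Treat::Entities::Sentence.from_string('The committee approved the proposal.')
    s.parse(:enju)           # assumption: Enju annotates verbs with :voice and dependencies
    roles = s.roles(:naive)

    s.each_word do |w|
      puts "main verb: #{w}" if w.has?(:is_main_verb?)
      puts "agent:     #{w}" if w.has?(:is_agent?)
      puts "patient:   #{w}" if w.has?(:is_patient?)
    end
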