treat 0.1.4 → 0.2.0

Files changed (160)
  1. data/LICENSE +4 -4
  2. data/TODO +21 -54
  3. data/lib/economist/half_cocked_basel.txt +16 -0
  4. data/lib/economist/hose_and_dry.doc +0 -0
  5. data/lib/economist/hungarys_troubles.abw +70 -0
  6. data/lib/economist/republican_nomination.pdf +0 -0
  7. data/lib/economist/saving_the_euro.odt +0 -0
  8. data/lib/economist/to_infinity_and_beyond.txt +15 -0
  9. data/lib/economist/zero_sum.html +91 -0
  10. data/lib/treat.rb +58 -72
  11. data/lib/treat/buildable.rb +59 -15
  12. data/lib/treat/categories.rb +26 -14
  13. data/lib/treat/category.rb +2 -2
  14. data/lib/treat/delegatable.rb +65 -48
  15. data/lib/treat/doable.rb +44 -0
  16. data/lib/treat/entities.rb +34 -14
  17. data/lib/treat/entities/collection.rb +2 -0
  18. data/lib/treat/entities/document.rb +3 -2
  19. data/lib/treat/entities/entity.rb +105 -90
  20. data/lib/treat/entities/phrases.rb +17 -0
  21. data/lib/treat/entities/tokens.rb +28 -13
  22. data/lib/treat/entities/zones.rb +20 -0
  23. data/lib/treat/extractors.rb +49 -11
  24. data/lib/treat/extractors/coreferences/stanford.rb +68 -0
  25. data/lib/treat/extractors/date/chronic.rb +32 -0
  26. data/lib/treat/extractors/date/ruby.rb +25 -0
  27. data/lib/treat/extractors/keywords/tf_idf.rb +26 -0
  28. data/lib/treat/extractors/keywords/{topics_frequency.rb → topics_tf_idf.rb} +15 -7
  29. data/lib/treat/{detectors/language/language_detector.rb → extractors/language/language_extractor.rb} +5 -2
  30. data/lib/treat/extractors/language/what_language.rb +49 -0
  31. data/lib/treat/extractors/named_entity_tag/stanford.rb +53 -0
  32. data/lib/treat/extractors/roles/naive.rb +73 -0
  33. data/lib/treat/extractors/statistics/frequency_in.rb +6 -13
  34. data/lib/treat/extractors/statistics/{position_in_parent.rb → position_in.rb} +1 -1
  35. data/lib/treat/extractors/statistics/tf_idf.rb +89 -21
  36. data/lib/treat/extractors/statistics/transition_matrix.rb +11 -11
  37. data/lib/treat/extractors/statistics/transition_probability.rb +4 -4
  38. data/lib/treat/extractors/time/nickel.rb +30 -12
  39. data/lib/treat/extractors/topic_words/lda.rb +9 -9
  40. data/lib/treat/extractors/topics/reuters.rb +14 -15
  41. data/lib/treat/extractors/topics/reuters/region.xml +1 -0
  42. data/lib/treat/features.rb +7 -0
  43. data/lib/treat/formatters/readers/abw.rb +6 -1
  44. data/lib/treat/formatters/readers/autoselect.rb +5 -6
  45. data/lib/treat/formatters/readers/doc.rb +3 -1
  46. data/lib/treat/formatters/readers/html.rb +1 -1
  47. data/lib/treat/formatters/readers/image.rb +43 -0
  48. data/lib/treat/formatters/readers/odt.rb +1 -2
  49. data/lib/treat/formatters/readers/pdf.rb +9 -1
  50. data/lib/treat/formatters/readers/xml.rb +40 -0
  51. data/lib/treat/formatters/serializers/xml.rb +50 -14
  52. data/lib/treat/formatters/serializers/yaml.rb +7 -2
  53. data/lib/treat/formatters/unserializers/xml.rb +33 -7
  54. data/lib/treat/formatters/visualizers/dot.rb +90 -20
  55. data/lib/treat/formatters/visualizers/short_value.rb +2 -2
  56. data/lib/treat/formatters/visualizers/standoff.rb +2 -2
  57. data/lib/treat/formatters/visualizers/tree.rb +1 -1
  58. data/lib/treat/formatters/visualizers/txt.rb +13 -4
  59. data/lib/treat/group.rb +16 -10
  60. data/lib/treat/helpers/linguistics_loader.rb +18 -0
  61. data/lib/treat/inflectors.rb +10 -0
  62. data/lib/treat/inflectors/cardinal_words/linguistics.rb +3 -3
  63. data/lib/treat/inflectors/conjugations/linguistics.rb +5 -12
  64. data/lib/treat/inflectors/declensions/english.rb +319 -0
  65. data/lib/treat/inflectors/declensions/linguistics.rb +12 -11
  66. data/lib/treat/inflectors/ordinal_words/linguistics.rb +3 -3
  67. data/lib/treat/install.rb +59 -0
  68. data/lib/treat/kernel.rb +18 -8
  69. data/lib/treat/languages.rb +18 -11
  70. data/lib/treat/languages/arabic.rb +4 -2
  71. data/lib/treat/languages/chinese.rb +6 -2
  72. data/lib/treat/languages/dutch.rb +16 -0
  73. data/lib/treat/languages/english.rb +47 -19
  74. data/lib/treat/languages/french.rb +8 -5
  75. data/lib/treat/languages/german.rb +9 -6
  76. data/lib/treat/languages/greek.rb +16 -0
  77. data/lib/treat/languages/italian.rb +6 -3
  78. data/lib/treat/languages/polish.rb +16 -0
  79. data/lib/treat/languages/portuguese.rb +16 -0
  80. data/lib/treat/languages/russian.rb +16 -0
  81. data/lib/treat/languages/spanish.rb +16 -0
  82. data/lib/treat/languages/swedish.rb +16 -0
  83. data/lib/treat/languages/tags.rb +377 -0
  84. data/lib/treat/lexicalizers.rb +34 -23
  85. data/lib/treat/lexicalizers/category/from_tag.rb +17 -10
  86. data/lib/treat/lexicalizers/linkages/naive.rb +51 -51
  87. data/lib/treat/lexicalizers/synsets/wordnet.rb +5 -1
  88. data/lib/treat/lexicalizers/tag/brill.rb +35 -40
  89. data/lib/treat/lexicalizers/tag/lingua.rb +19 -14
  90. data/lib/treat/lexicalizers/tag/stanford.rb +59 -68
  91. data/lib/treat/lexicalizers/tag/tagger.rb +29 -0
  92. data/lib/treat/processors.rb +8 -8
  93. data/lib/treat/processors/chunkers/txt.rb +4 -4
  94. data/lib/treat/processors/parsers/enju.rb +114 -99
  95. data/lib/treat/processors/parsers/stanford.rb +109 -41
  96. data/lib/treat/processors/segmenters/punkt.rb +17 -18
  97. data/lib/treat/processors/segmenters/punkt/dutch.yaml +9716 -0
  98. data/lib/treat/processors/segmenters/punkt/english.yaml +10340 -0
  99. data/lib/treat/processors/segmenters/punkt/french.yaml +43159 -0
  100. data/lib/treat/processors/segmenters/punkt/german.yaml +9572 -0
  101. data/lib/treat/processors/segmenters/punkt/greek.yaml +6050 -0
  102. data/lib/treat/processors/segmenters/punkt/italian.yaml +14748 -0
  103. data/lib/treat/processors/segmenters/punkt/polish.yaml +9751 -0
  104. data/lib/treat/processors/segmenters/punkt/portuguese.yaml +13662 -0
  105. data/lib/treat/processors/segmenters/punkt/russian.yaml +4237 -0
  106. data/lib/treat/processors/segmenters/punkt/spanish.yaml +24034 -0
  107. data/lib/treat/processors/segmenters/punkt/swedish.yaml +10001 -0
  108. data/lib/treat/processors/segmenters/stanford.rb +38 -37
  109. data/lib/treat/processors/segmenters/tactful.rb +5 -4
  110. data/lib/treat/processors/tokenizers/macintyre.rb +7 -6
  111. data/lib/treat/processors/tokenizers/multilingual.rb +2 -3
  112. data/lib/treat/processors/tokenizers/perl.rb +2 -2
  113. data/lib/treat/processors/tokenizers/punkt.rb +6 -2
  114. data/lib/treat/processors/tokenizers/stanford.rb +25 -24
  115. data/lib/treat/processors/tokenizers/tactful.rb +1 -2
  116. data/lib/treat/proxies.rb +2 -35
  117. data/lib/treat/registrable.rb +17 -22
  118. data/lib/treat/sugar.rb +11 -11
  119. data/lib/treat/tree.rb +27 -17
  120. data/lib/treat/viewable.rb +29 -0
  121. data/lib/treat/visitable.rb +1 -1
  122. data/test/tc_entity.rb +56 -49
  123. data/test/tc_extractors.rb +41 -18
  124. data/test/tc_formatters.rb +7 -8
  125. data/test/tc_inflectors.rb +19 -24
  126. data/test/tc_lexicalizers.rb +12 -19
  127. data/test/tc_processors.rb +26 -12
  128. data/test/tc_resources.rb +2 -7
  129. data/test/tc_treat.rb +20 -22
  130. data/test/tc_tree.rb +4 -4
  131. data/test/tests.rb +3 -5
  132. data/test/texts.rb +13 -14
  133. data/tmp/INFO +1 -0
  134. metadata +78 -158
  135. data/bin/INFO +0 -1
  136. data/examples/benchmark.rb +0 -81
  137. data/examples/keywords.rb +0 -148
  138. data/lib/treat/detectors.rb +0 -31
  139. data/lib/treat/detectors/encoding/r_chardet19.rb +0 -27
  140. data/lib/treat/detectors/format/file.rb +0 -36
  141. data/lib/treat/detectors/language/what_language.rb +0 -29
  142. data/lib/treat/entities/constituents.rb +0 -15
  143. data/lib/treat/entities/sentence.rb +0 -8
  144. data/lib/treat/extractors/named_entity/abner.rb +0 -20
  145. data/lib/treat/extractors/named_entity/stanford.rb +0 -174
  146. data/lib/treat/extractors/statistics/frequency_of.rb +0 -15
  147. data/lib/treat/extractors/time/chronic.rb +0 -20
  148. data/lib/treat/extractors/time/native.rb +0 -18
  149. data/lib/treat/formatters/readers/gocr.rb +0 -26
  150. data/lib/treat/formatters/readers/ocropus.rb +0 -31
  151. data/lib/treat/formatters/visualizers/html.rb +0 -13
  152. data/lib/treat/formatters/visualizers/inspect.rb +0 -20
  153. data/lib/treat/inflectors/declensions/en.rb +0 -18
  154. data/lib/treat/languages/categories.rb +0 -5
  155. data/lib/treat/languages/english/categories.rb +0 -23
  156. data/lib/treat/languages/english/tags.rb +0 -352
  157. data/lib/treat/languages/xinhua.rb +0 -12
  158. data/lib/treat/lexicalizers/synsets/rita_wn.rb +0 -23
  159. data/lib/treat/string.rb +0 -5
  160. data/test/tc_detectors.rb +0 -26

data/lib/treat/processors/segmenters/stanford.rb

@@ -1,49 +1,50 @@
  module Treat
    module Processors
      module Segmenters
-       # A wrapper for the sentence splitter supplied by
+       # A wrapper for the sentence splitter supplied by
        # the Stanford parser.
        class Stanford
-         # Require the Ruby-Java bridge.
-         silence_warnings do
-           require 'rjb'
-           jar = "#{Treat.bin}/stanford-parser*/stanford-parser*.jar"
-           jars = Dir.glob(jar)
-           if jars.empty? || !File.readable?(jars[0])
-             raise "Could not find stanford parser JAR file (lookin in #{jar})."+
-             " You may need to manually download the JAR files and/or set Treat.bin."
-           end
-           ::Rjb::load(jars[0])
-           DocumentPreprocessor =
-           ::Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
-           StringReader = ::Rjb::import('java.io.StringReader')
-         end
-         # Segment sentences using the sentence splitter supplied by
-         # the Stanford parser.
+         require 'stanford-core-nlp'
+         DefaultOptions = {
+           :silence => false,
+           :log_to_file => false,
+           :also_tokenize => false
+         }
+         # Segment sentences using the sentence splitter supplied by
+         # the Stanford parser. By default, this segmenter also adds
+         # the tokens as children of the sentences.
+         #
+         # Options:
+         # - (Boolean) :also_tokenize - Whether to also add the tokens
+         #   as children of the sentence.
+         # - (String) :log_to_file => a filename to log output to
+         #   instead of displaying it.
+         # - (String) :silence => send
          def self.segment(entity, options = {})
-           sr = StringReader.new(entity.to_s)
-           sit = DocumentPreprocessor.new(sr).iterator
-           while sit.has_next
-             str = sit.next.to_string
-             str.gsub!(', ', ' ') # Fix - find better way to implode.
-             str.gsub!(' \'s', '\'s')
-             str.gsub!(' .', '.')
-             str.gsub!(' ,', ',')
-             str.gsub!(' ;', ';')
-             str.gsub!(/-[A-Z]{3}-/, '')
-             str = str[1..-2]
-             sentence = Entities::Entity.from_string(str)
-             if options[:tokenize] == true
-               tit = s.iterator
-               while tit.has_next
-                 w = tit.next.word
-                 next if w[0] == '-' && w[-1] == '-'
-                 sentence << Entities::Entity.from_string(w)
+           options = DefaultOptions.merge(options)
+           options[:log_to_file] = '/dev/null' if options[:silence]
+           if options[:log_to_file]
+             ::StanfordCoreNLP.log_file = options[:log_to_file]
+           end
+           options = DefaultOptions.merge(options)
+           pipeline = ::StanfordCoreNLP.load(:tokenize, :ssplit)
+           text = ::StanfordCoreNLP::Text.new(entity.to_s)
+           pipeline.annotate(text)
+           text.get(:sentences).each do |sentence|
+             s = Treat::Entities::Sentence.from_string(sentence.to_s, true)
+             entity << s
+             if options[:also_tokenize]
+               sentence.get(:tokens).each do |token|
+                 t = Treat::Entities::Phrase.from_string(token.value)
+                 s << t
+                 t.set :character_offset_begin,
+                   token.get(:character_offset_begin)
+
+                 t.set :character_offset_end,
+                   token.get(:character_offset_end)
                end
             end
-            entity << sentence
          end
-          entity
        end
      end
    end
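
For reference, a minimal usage sketch of the rewritten segmenter. The class, method and option names come from the hunk above; the sample text is made up, and it assumes the stanford-core-nlp gem and its model files are installed.

  require 'treat'
  # Build an entity from a plain string via the String proxy.
  paragraph = 'Wolves hunt in packs. They howl at night.'.to_entity
  # Split into sentences; :also_tokenize also attaches the tokens.
  Treat::Processors::Segmenters::Stanford.segment(
    paragraph, :also_tokenize => true, :silence => true
  )
  paragraph.children.each { |s| puts s.to_s }  # one sentence per line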

data/lib/treat/processors/segmenters/tactful.rb

@@ -15,7 +15,7 @@ module Treat
        class Tactful
          # Require the 'tactful_tokenizer' gem.
          silence_warnings { require 'tactful_tokenizer' }
-         # Somewhere in the depths of the code this is defined...
+         # Remove function definition 'tactful_tokenizer' by gem.
          String.class_eval { undef :tokenize }
          # Keep only one copy of the segmenter.
          @@segmenter = nil
@@ -25,11 +25,12 @@ module Treat
          # Options: none.
          def self.segment(entity, options = {})
            @@segmenter ||= TactfulTokenizer::Model.new
-           sentences = @@segmenter.tokenize_text(entity.to_s)
+           s = entity.to_s
+           s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
+           sentences = @@segmenter.tokenize_text(s)
            sentences.each do |sentence|
-             entity << Entities::Entity.from_string(sentence)
+             entity << Entities::Phrase.from_string(sentence)
            end
-           entity
          end
        end
      end
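
The gsub! added above restores the missing space after sentence-ending punctuation before the text is handed to tactful_tokenizer. A standalone illustration of that one line (the sample string is made up):

  s = 'First sentence.Second one!Third?'
  s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
  s  # => "First sentence. Second one! Third?"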

data/lib/treat/processors/tokenizers/macintyre.rb

@@ -16,13 +16,15 @@ module Treat
        class Macintyre
          # Tokenize the entity using a native rule-based algorithm.
          def self.tokenize(entity, options = {})
-           raise 'Error' if entity.has_children?
-           chunks = self.split(entity.to_s)
+           if entity.has_children?
+             raise Treat::Exception,
+             'Cannot tokenize a Phrase that already has children.'
+           end
+           chunks = split(entity.to_s)
            chunks.each do |chunk|
              next if chunk =~ /([[:space:]]+)/
-             entity << Treat::Entities::Entity.from_string(chunk)
+             entity << Treat::Entities::Token.from_string(chunk)
            end
-           entity
          end
          # Helper method to split the string into tokens.
          def self.split(string)
@@ -63,8 +65,7 @@ module Treat
            s.gsub!(/ '([Tt])is /,' \'\1 is ')
            s.gsub!(/ '([Tt])was /,' \'\1 was ')
            s.gsub!(/ ([Ww])anna /,' \1an na ')
-           while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4')
-           end
+           while s.sub!(/(\s)([0-9]+) , ([0-9]+)(\s)/, '\1\2,\3\4'); end
            s.gsub!(/\//, ' / ')
            s.gsub!(/\s+/,' ')
            s.strip!

data/lib/treat/processors/tokenizers/multilingual.rb

@@ -14,16 +14,15 @@ module Treat
          # :language => (Symbol) Force a language for the tokenizer.
          def self.tokenize(entity, options = {})
            lang = options[:language] ? options[:language] : entity.language
-           lang = Treat::Languages.find(lang, 1)
+           lang = Treat::Languages.code(lang, 1)
            if @@tokenizers[lang].nil?
              @@tokenizers[lang] = ::Tokenizer::Tokenizer.new(lang)
            end
            tokens = @@tokenizers[lang].tokenize(entity.to_s)
            tokens.each do |token|
              next if token =~ /([[:space:]]+)/
-             entity << Treat::Entities::Entity.from_string(token)
+             entity << Treat::Entities::Token.from_string(token)
            end
-           entity
          end
        end
      end

data/lib/treat/processors/tokenizers/perl.rb

@@ -85,9 +85,9 @@ module Treat
            #s/ (wan)(na) / $1 $2 /ig;
            text.gsub!(/ (wan)(na) /i,' \1 \2 ')
            tokens = text.split(/\s/)
-           tokens.each do |token|
+           tokens[1..-1].each do |token|
              next if token =~ /([[:space:]]+)/
-             entity << Treat::Entities::Entity.from_string(token)
+             entity << Treat::Entities::Token.from_string(token)
            end
          end
        end

data/lib/treat/processors/tokenizers/punkt.rb

@@ -31,9 +31,13 @@ module Treat
          # Options: none.
          def self.tokenize(entity, options = {})
            entity.to_s.scan(ReWordTokenizer).each do |token|
-             entity << Treat::Entities::Entity.from_string(token)
+             if SentEndChars.include?(token[-1])
+               entity << Treat::Entities::Token.from_string(token[0..-2])
+               entity << Treat::Entities::Token.from_string(token[-1..-1])
+             else
+               entity << Treat::Entities::Token.from_string(token)
+             end
            end
-           entity
          end
        end
      end

data/lib/treat/processors/tokenizers/stanford.rb

@@ -4,34 +4,35 @@ module Treat
        # A wrapper for the Stanford parser's Penn-Treebank
        # style tokenizer.
        class Stanford
-         # Require the Ruby-Java bridge.
-         silence_warnings do
-           require 'rjb'
-           # Load the Stanford Parser Java files.
-           jar = "#{Treat.bin}/stanford-parser/stanford-parser.jar"
-           jars = Dir.glob(jar)
-           if jars.empty? || !File.readable?(jars[0])
-             raise "Could not find stanford parser JAR file (looking in #{jar})."+
-             " You may need to manually download the JAR files and/or set Treat.bin."
-           end
-           ::Rjb::load(jars[0])
-           # Load the Stanford Parser classes.
-           PTBTokenizer = ::Rjb::import('edu.stanford.nlp.process.PTBTokenizer')
-           CoreLabelTokenFactory = ::Rjb::import('edu.stanford.nlp.process.CoreLabelTokenFactory')
-           StringReader = ::Rjb::import('java.io.StringReader')
-         end
+         require 'stanford-core-nlp'
+         DefaultOptions = {
+           :silence => false,
+           :log_to_file => nil
+         }
+         @@tokenizer = nil
          # Tokenize the entity using a Penn-Treebank style tokenizer
          # included with the Stanford Parser.
+         #
+         # Options:
+         # - (String) :log_to_file => a filename to log output to
+         #   instead of displaying it.
          def self.tokenize(entity, options = {})
-           ptbt = PTBTokenizer.new(
-             StringReader.new(entity.to_s),
-             CoreLabelTokenFactory.new, '')
-           while ptbt.has_next
-             w = ptbt.next.word
-             next if w[0] == '-' && w[-1] == '-'
-             entity << Treat::Entities::Entity.from_string(w)
+           options = DefaultOptions.merge(options)
+           options[:log_to_file] = '/dev/null' if options[:silence]
+           if options[:log_to_file]
+             ::StanfordCoreNLP.log_file = options[:log_to_file]
+           end
+           @@tokenizer ||= ::StanfordCoreNLP.load(:tokenize)
+           text = ::StanfordCoreNLP::Text.new(entity.to_s)
+           @@tokenizer.annotate(text)
+           text.get(:tokens).each do |token|
+             t = Treat::Entities::Token.from_string(token.value)
+             entity << t
+             t.set :character_offset_begin,
+               token.get(:character_offset_begin)
+             t.set :character_offset_end,
+               token.get(:character_offset_end)
            end
-           entity
          end
        end
      end
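
A minimal usage sketch of the new tokenizer. The class, method and option names are taken from the hunk above; the sample phrase is made up, stanford-core-nlp is assumed to be installed, and reading the offsets back through the features hash is my assumption based on the t.set calls.

  require 'treat'
  phrase = 'A sample phrase to tokenize'.to_entity
  Treat::Processors::Tokenizers::Stanford.tokenize(phrase, :silence => true)
  phrase.children.each do |t|
    # The offsets were stored with t.set, so they live in the features hash.
    puts "#{t.to_s} " +
         "[#{t.features[:character_offset_begin]}.." +
         "#{t.features[:character_offset_end]}]"
  end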

data/lib/treat/processors/tokenizers/tactful.rb

@@ -49,9 +49,8 @@ module Treat
              s.gsub!(rules[0], rules[1])
            end
            s.split(' ').each do |token|
-             entity << Entities::Entity.from_string(token)
+             entity << Entities::Token.from_string(token)
            end
-           entity
          end
        end
      end

data/lib/treat/proxies.rb

@@ -7,7 +7,7 @@ module Treat
        # Build the entity corresponding to the proxied
        # object and send the method call to the entity.
        def method_missing(sym, *args, &block)
-         if Treat::Categories.have_method?(sym)
+         if sym == :do || Treat::Categories.lookup(sym)
            to_entity.send(sym, *args)
          else
            super(sym, *args, &block)
@@ -20,10 +20,6 @@ module Treat
      # Install Treat functions on String objects.
      module String
        include Treat::Proxies::Proxy
-       # Save the string to the specified file.
-       def save(file)
-         File.open(file, 'w') { |f| f.write(self) }
-       end
        # Return the entity corresponding to the string.
        def to_entity
          Treat::Entities::Entity.from_string(self.to_s)
@@ -34,40 +30,11 @@ module Treat
        include Treat::Proxies::Proxy
        # Return the entity corresponding to the number.
        def to_entity(builder = nil)
-         Treat::Entities::Entity.from_numeric(self)
-       end
-     end
-     # Install Treat functions on Array objects.
-     module Array
-       include Treat::Proxies::Proxy
-       # The behaviour of this proxy is special:
-       # if a Treat function is called on an array,
-       # the function will be called on each element
-       # of the array and a new array with the
-       # results will be returned.
-       def method_missing(sym, *args, &block)
-         if Category.has_method?(sym)
-           array = []
-           each do |element|
-             if element.is_a? Treat::Entities::Entity
-               array << element.send(sym, *args)
-             else
-               unless [Numeric, String, Array].include?(element.class)
-                 raise Treat::Exception "Cannot convert object with type " +
-                 "#{element.class} into an entity."
-               end
-               array << element.to_entity.send(sym, *args)
-             end
-           end
-           array
-         else
-           super(sym, *args, &block)
-         end
+         Treat::Entities::Number.from_numeric(self)
        end
      end
      # Include the proxies in the core classes.
      ::String.class_eval { include Treat::Proxies::String }
      ::Numeric.class_eval { include Treat::Proxies::Numeric }
-     ::Array.class_eval { include Treat::Proxies::Array }
    end
  end
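
In practice the proxy change means Strings and Numerics still build entities on demand, while Arrays no longer respond to Treat methods. A hedged sketch (the calls are illustrative only):

  'A short sentence.'.to_entity  # entity built via Treat::Entities::Entity.from_string
  20.to_entity                   # entity built via Treat::Entities::Number.from_numeric
  ['one', 'two'].language        # now raises NoMethodError: the Array proxy is gone
  ['one', 'two'].map { |s| s.to_entity }  # map over the elements instead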

data/lib/treat/registrable.rb

@@ -1,32 +1,27 @@
  module Treat
    module Registrable
-     # Registers a token in the @token_registry
-     # hash in the root node.
+     # Registers a token in the @token_registry hash.
      def register_token(token)
-       if is_root? || type == :document
-         @token_registry ||= {value: {}, id: {}}
-         @token_registry[:id][token.id] = token
-         @token_registry[:value][token.to_s] ||= []
-         @token_registry[:value][token.to_s] << token
-         if has_parent? && type == :document
-           @parent.register_token(token)
-         end
-       else
-         @parent.register_token(token)
-       end
+       @token_registry ||= {:value => {}, :id => {}}
+       @token_registry[:id][token.id] = token
+       v = token.to_s.downcase
+       @token_registry[:value][v] ||= []
+       @token_registry[:value][v] << token
+       @parent.register_token(token) if has_parent?
      end
-     # Find the token registry, which is
-     # always in the root node.
+     # Find the token registry, by default the one
+     # in the root node.
      def token_registry(type = nil)
-       if self.type == type
-         @token_registry ||= {value: {}, id: {}}
+       if (type == nil && is_root?) || type == self.type
+         @token_registry ||= {:value => {}, :id => {}}
          return @token_registry
-       end
-       if has_parent?
-         @parent.token_registry(type)
        else
-         @token_registry ||= {value: {}, id: {}}
-         @token_registry
+         if has_parent?
+           @parent.token_registry(type)
+         else
+           @token_registry ||= {:value => {}, :id => {}}
+           @token_registry
+         end
        end
      end
    end
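
The registry now keys token values on their downcased form, so value lookups become case-insensitive. A plain-Ruby illustration of that bucketing (no Treat objects involved):

  registry = { :value => {}, :id => {} }
  ['The', 'the', 'THE'].each_with_index do |w, i|
    v = w.downcase
    registry[:value][v] ||= []
    registry[:value][v] << i
  end
  registry[:value]['the']  # => [0, 1, 2]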

data/lib/treat/sugar.rb

@@ -5,9 +5,9 @@ module Treat
    # Treat::Entities::Word can now be referred to as simply 'Word'.
    module Sugar
      # Installs syntactic sugar.
-     def edulcorate
-       return if @@edulcorated
-       @@edulcorated = true
+     def sweeten!
+       return if @@sweetened
+       @@sweetened = true
        each_entity_class do |type, klass|
          unless type == :Symbol
            Object.class_eval do
@@ -18,11 +18,11 @@ module Treat
          end
        end
      end
-     alias :sweeten :edulcorate
+
      # Uninstalls syntactic sugar.
-     def unedulcorate
-       return unless @@edulcorated
-       @@edulcorated = false
+     def unsweeten!
+       return unless @@sweetened
+       @@sweetened = false
        each_entity_class do |type, klass|
          unless type == :Symbol
            Object.class_eval do
@@ -31,17 +31,17 @@ module Treat
          end
        end
      end
-     alias :unsweeten :unedulcorate
+
      # Boolean - whether syntactic sugar is
      # enabled or not.
-     def edulcorated?; @@edulcorated; end
+     def sweetened?; @@sweetened; end
      # Syntactic sugar is disabled by default.
-     @@edulcorated = false
+     @@sweetened = false
      private
      # Helper method, yields each entity type and class.
      def each_entity_class
        Treat::Entities.list.each do |entity_type|
-         type = :"#{cc(entity_type)}"
+         type = cc(entity_type).intern
          klass = Treat::Entities.const_get(type, klass)
          yield type, klass
        end
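
A usage sketch for the renamed sugar methods. It assumes, as with edulcorate in 0.1.x, that Sugar is extended into the top-level Treat module; the Sentence builder shown is the shorthand the sugar installs on Object, which is not visible in these hunks.

  require 'treat'
  Treat.sweeten!    # replaces Treat.edulcorate / Treat.sweeten
  Treat.sweetened?  # => true
  s = Sentence 'Entity classes are now reachable without their namespace.'
  Treat.unsweeten!  # replaces Treat.unedulcorate / Treat.unsweeten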

data/lib/treat/tree.rb

@@ -1,10 +1,10 @@
  module Treat
    # This module provides an abstract tree structure with
-   # nodes having an id, a value, children, features and edges.
+   # nodes having an id, a value, children, features and dependencies.
    module Tree
      # This class models the nodes for an N-ary tree data structue
      # with unique identifiers, text value, children, features
-     # (annotations) and edges.
+     # (annotations) and dependencies.
      #
      # This class was tightly based on the 'rubytree' gem.
      # RubyTree is licensed under the BSD license and can
@@ -24,28 +24,30 @@ module Treat
        attr_reader :children
        # A hash containing the features of this node.
        attr_accessor :features
-       # A hash containing the edges that link this
+       # An array containing the dependencies that link this
        # node to other nodes.
-       attr_accessor :edges
+       attr_accessor :dependencies
+       # A struct for dependencies.
+       Struct.new('Dependency', :target, :type, :directed, :direction)
        # The parent of the node.
        attr_accessor :parent
        # Initialize the node with its value and id.
        # Setup containers for the children, features
-       # and edges of this node.
+       # and dependencies of this node.
        def initialize(value, id = nil)
          @parent = nil
          @value, @id = value, id
          @children = []
          @children_hash = {}
          @features = {}
-         @edges = {}
+         @dependencies = []
        end
-       # Boolean - does the node have edges?
-       def has_edges?; !@edges.empty?; end
+       # Boolean - does the node have dependencies?
+       def has_dependencies?; !(@dependencies.size == 0); end
        # Boolean - does the node have children?
-       def has_children?; !@children.empty?; end
+       def has_children?; !(@children.size == 0); end
        # Boolean - does the node have features?
-       def has_features?; !@features.empty?; end
+       def has_features?; !(@features.size == 0); end
        # Boolean - does the node have a parent?
        def has_parent?; !@parent.nil?; end
        # Boolean - does the node not have a parent?
@@ -132,6 +134,10 @@ module Treat
          @features ||= {}
          @features[feature] = value
        end
+       # Unset a feature.
+       def unset(feature)
+         @features.delete(feature)
+       end
        # Return the depth of this node in the tree.
        def depth
          return 0 if is_root?
@@ -139,23 +145,26 @@ module Treat
        end
        # Does the entity have a feature ?
        def has_feature?(feature)
-         @features.has_key?(feature) ||
-         [:id, :value, :children, :edges].include?(feature)
+         (@features.has_key?(feature) &&
+         !@features[feature].nil?) ||
+         [:id, :value, :children, :dependencies].include?(feature)
        end
        alias :has? :has_feature?
        # Link this node to the target node with
-       # the supplied edge type.
-       def associate(id_or_node, edge_type = nil)
-         if id_or_node.is_a? Treat::Tree::Node
+       # the supplied dependency type.
+       def link(id_or_node, type = nil, directed = true, direction = 1)
+         if id_or_node.is_a?(Treat::Tree::Node)
            id = root.find(id_or_node).id
          else
            id = id_or_node
          end
-         @edges[id] = edge_type if id
+         @dependencies.each { |d| return if d.target == id }
+         @dependencies <<
+           Struct::Dependency.new(id, type, directed, direction)
        end
        # Find the node in the tree with the given id.
        def find(id_or_node)
-         if id_or_node.is_a? self.class
+         if id_or_node.is_a?(Treat::Tree::Node)
            id = id_or_node.id
          else
            id = id_or_node
@@ -165,6 +174,7 @@ module Treat
            r = child.find(id)
            return r if r.is_a? Tree::Node
          end
+         nil
        end
        # Find the root of the tree within which
        # this node is contained.
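
A minimal sketch of the new dependency API, using only the methods and struct fields that appear in these hunks (the node value, id and label are made up):

  require 'treat'
  node = Treat::Tree::Node.new('fox', 1)
  # Register a directed dependency on the node with id 2.
  node.link(2, 'det', true, 1)
  node.has_dependencies?      # => true
  dep = node.dependencies.first
  dep.target                  # => 2
  dep.type                    # => 'det'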