treat 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (147) hide show
  1. data/INSTALL +0 -0
  2. data/LICENSE +28 -0
  3. data/README +0 -0
  4. data/TODO +67 -0
  5. data/bin/INFO +1 -0
  6. data/examples/benchmark.rb +81 -0
  7. data/examples/keywords.rb +60 -0
  8. data/examples/texts/bugged_out.txt +26 -0
  9. data/examples/texts/half_cocked_basel.txt +16 -0
  10. data/examples/texts/hedge_funds.txt +24 -0
  11. data/examples/texts/hose_and_dry.txt +19 -0
  12. data/examples/texts/hungarys_troubles.txt +46 -0
  13. data/examples/texts/indias_slowdown.txt +15 -0
  14. data/examples/texts/merkozy_rides_again.txt +24 -0
  15. data/examples/texts/prada_is_not_walmart.txt +9 -0
  16. data/examples/texts/republican_nomination.txt +26 -0
  17. data/examples/texts/to_infinity_and_beyond.txt +15 -0
  18. data/lib/treat.rb +91 -0
  19. data/lib/treat/buildable.rb +115 -0
  20. data/lib/treat/categories.rb +29 -0
  21. data/lib/treat/category.rb +28 -0
  22. data/lib/treat/delegatable.rb +90 -0
  23. data/lib/treat/detectors.rb +28 -0
  24. data/lib/treat/detectors/encoding/native.rb +12 -0
  25. data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
  26. data/lib/treat/detectors/format/file.rb +36 -0
  27. data/lib/treat/detectors/language/language_detector.rb +19 -0
  28. data/lib/treat/detectors/language/what_language.rb +29 -0
  29. data/lib/treat/entities.rb +52 -0
  30. data/lib/treat/entities/collection.rb +19 -0
  31. data/lib/treat/entities/constituents.rb +15 -0
  32. data/lib/treat/entities/document.rb +11 -0
  33. data/lib/treat/entities/entity.rb +242 -0
  34. data/lib/treat/entities/sentence.rb +8 -0
  35. data/lib/treat/entities/text.rb +7 -0
  36. data/lib/treat/entities/tokens.rb +37 -0
  37. data/lib/treat/entities/zones.rb +17 -0
  38. data/lib/treat/exception.rb +5 -0
  39. data/lib/treat/extractors.rb +41 -0
  40. data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
  41. data/lib/treat/extractors/named_entity/abner.rb +20 -0
  42. data/lib/treat/extractors/named_entity/stanford.rb +174 -0
  43. data/lib/treat/extractors/statistics/frequency.rb +22 -0
  44. data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
  45. data/lib/treat/extractors/statistics/position_in.rb +13 -0
  46. data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
  47. data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
  48. data/lib/treat/extractors/time/chronic.rb +12 -0
  49. data/lib/treat/extractors/time/native.rb +12 -0
  50. data/lib/treat/extractors/time/nickel.rb +45 -0
  51. data/lib/treat/extractors/topic_words/lda.rb +71 -0
  52. data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
  53. data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
  54. data/lib/treat/extractors/topics/reuters.rb +91 -0
  55. data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
  56. data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
  57. data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
  58. data/lib/treat/feature.rb +53 -0
  59. data/lib/treat/formatters.rb +44 -0
  60. data/lib/treat/formatters/cleaners/html.rb +17 -0
  61. data/lib/treat/formatters/readers/autoselect.rb +35 -0
  62. data/lib/treat/formatters/readers/gocr.rb +24 -0
  63. data/lib/treat/formatters/readers/html.rb +13 -0
  64. data/lib/treat/formatters/readers/ocropus.rb +31 -0
  65. data/lib/treat/formatters/readers/pdf.rb +17 -0
  66. data/lib/treat/formatters/readers/txt.rb +15 -0
  67. data/lib/treat/formatters/serializers/xml.rb +48 -0
  68. data/lib/treat/formatters/serializers/yaml.rb +15 -0
  69. data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
  70. data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
  71. data/lib/treat/formatters/unserializers/xml.rb +79 -0
  72. data/lib/treat/formatters/unserializers/yaml.rb +15 -0
  73. data/lib/treat/formatters/visualizers/dot.rb +73 -0
  74. data/lib/treat/formatters/visualizers/html.rb +12 -0
  75. data/lib/treat/formatters/visualizers/inspect.rb +16 -0
  76. data/lib/treat/formatters/visualizers/short_value.rb +14 -0
  77. data/lib/treat/formatters/visualizers/standoff.rb +41 -0
  78. data/lib/treat/formatters/visualizers/tree.rb +28 -0
  79. data/lib/treat/formatters/visualizers/txt.rb +31 -0
  80. data/lib/treat/group.rb +96 -0
  81. data/lib/treat/inflectors.rb +50 -0
  82. data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
  83. data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
  84. data/lib/treat/inflectors/declensors/en.rb +18 -0
  85. data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
  86. data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
  87. data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
  88. data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
  89. data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
  90. data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
  91. data/lib/treat/inflectors/stemmers/porter.rb +158 -0
  92. data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
  93. data/lib/treat/inflectors/stemmers/uea.rb +30 -0
  94. data/lib/treat/lexicalizers.rb +49 -0
  95. data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
  96. data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
  97. data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
  98. data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
  99. data/lib/treat/lexicalizers/tag/brill.rb +101 -0
  100. data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
  101. data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
  102. data/lib/treat/processors.rb +45 -0
  103. data/lib/treat/processors/chunkers/txt.rb +27 -0
  104. data/lib/treat/processors/parsers/enju.rb +214 -0
  105. data/lib/treat/processors/parsers/stanford.rb +60 -0
  106. data/lib/treat/processors/segmenters/punkt.rb +48 -0
  107. data/lib/treat/processors/segmenters/stanford.rb +45 -0
  108. data/lib/treat/processors/segmenters/tactful.rb +34 -0
  109. data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
  110. data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
  111. data/lib/treat/processors/tokenizers/perl.rb +96 -0
  112. data/lib/treat/processors/tokenizers/punkt.rb +42 -0
  113. data/lib/treat/processors/tokenizers/stanford.rb +33 -0
  114. data/lib/treat/processors/tokenizers/tactful.rb +59 -0
  115. data/lib/treat/proxies.rb +66 -0
  116. data/lib/treat/registrable.rb +26 -0
  117. data/lib/treat/resources.rb +10 -0
  118. data/lib/treat/resources/categories.rb +18 -0
  119. data/lib/treat/resources/delegates.rb +96 -0
  120. data/lib/treat/resources/dependencies.rb +0 -0
  121. data/lib/treat/resources/edges.rb +8 -0
  122. data/lib/treat/resources/formats.rb +23 -0
  123. data/lib/treat/resources/languages.rb +86 -0
  124. data/lib/treat/resources/languages.txt +504 -0
  125. data/lib/treat/resources/tags.rb +393 -0
  126. data/lib/treat/sugar.rb +43 -0
  127. data/lib/treat/tree.rb +174 -0
  128. data/lib/treat/utilities.rb +127 -0
  129. data/lib/treat/visitable.rb +27 -0
  130. data/test/profile.rb +2 -0
  131. data/test/tc_detectors.rb +27 -0
  132. data/test/tc_entity.rb +105 -0
  133. data/test/tc_extractors.rb +48 -0
  134. data/test/tc_formatters.rb +46 -0
  135. data/test/tc_inflectors.rb +39 -0
  136. data/test/tc_lexicalizers.rb +39 -0
  137. data/test/tc_processors.rb +36 -0
  138. data/test/tc_resources.rb +27 -0
  139. data/test/tc_treat.rb +64 -0
  140. data/test/tc_tree.rb +60 -0
  141. data/test/tests.rb +19 -0
  142. data/test/texts.rb +20 -0
  143. data/test/texts/english/long.html +24 -0
  144. data/test/texts/english/long.txt +22 -0
  145. data/test/texts/english/medium.txt +5 -0
  146. data/test/texts/english/short.txt +3 -0
  147. metadata +412 -0
@@ -0,0 +1,53 @@
1
module Treat
  # A Feature holds a probability distribution over the possible values
  # of an annotation. Apart from the handful of methods defined below,
  # every message sent to a Feature is forwarded to an object built from
  # its most probable value, so the feature can be used as if it were
  # that value.
  class Feature
    # Undefine all public instance methods, except the ones whose
    # absence would create problems (e.g. with serializing), so that
    # all other calls end up in method_missing.
    instance_methods.each do |meth|
      undef_method(meth) if meth !~
      /^(__|object_id|class|instance_variables|instance_variable_get)/
    end
    # Allows to read the probability hash,
    # the possible values of the feature,
    # and the best value (with highest P).
    attr_reader :p_hash, :values, :best
    # Initialize the feature with a hash
    # of features => probabilities.
    #
    # The best value is the key with the highest probability; ties are
    # broken at random. The delegation target is a fresh instance of the
    # class of the first key built from the best value, or the best value
    # itself when that class has no public constructor (Symbol, NilClass,
    # Integer, Float, ...).
    def initialize(p_hash)
      @p_hash = p_hash
      normalize
      max = @p_hash.values.max
      @best = @p_hash.select { |_, prob| prob == max }.keys.sample
      @values = @p_hash.keys
      type = @values[0].class
      # Immediate/value classes undefine `new`; calling it would raise
      # NoMethodError, so delegate to the value directly in that case.
      @object = type.respond_to?(:new) ? type.new(@best) : @best
    end
    # Normalize the probabilities, so that
    # the sum of all probabilities is one,
    # except if the sum of all probabilities
    # is already below one (in which case we
    # assume that the feature is intentionally
    # incomplete). The hash passed to the constructor
    # is never modified; a new hash replaces it.
    def normalize
      sum = @p_hash.values.inject(0.0) { |acc, prob| acc + prob }
      return if sum <= 1.0
      scaled = {}
      @p_hash.each { |k, v| scaled[k] = v.to_f / sum.to_f }
      @p_hash = scaled
    end
    # Find the probability of value x (0 when x is unknown).
    def probability(x)
      @p_hash[x] || 0
    end
    # Alias for probability: p(x).
    alias :p :probability
    # Catch all other methods than the ones
    # explicitly defined and forward them to
    # the object built from the best value.
    def method_missing(sym, *args, &block)
      @object.send(sym, *args, &block)
    end
  end
end
@@ -0,0 +1,44 @@
1
+ module Treat
2
+ # Formatters handle conversion of Entities to and from
3
+ # external file formats.
4
+ module Formatters
5
+ # Readers read a document and create the top-level entity
6
+ # corresponding to the content of the document.
7
+ module Readers
8
+ extend Group
9
+ self.type = :transformer
10
+ self.targets = [:collection, :document]
11
+ self.default = :autoselect
12
+ end
13
+ # Unserializers recreate entities from a serialized format.
14
+ module Unserializers
15
+ extend Group
16
+ self.type = :transformer
17
+ self.targets = [:collection, :document]
18
+ self.default = :autoselect
19
+ end
20
+ # Visualizers transform entities into a visualizable format.
21
+ module Visualizers
22
+ extend Group
23
+ self.type = :computer
24
+ self.targets = [:entity]
25
+ self.default = :tree
26
+ end
27
+ # Serializers transform entities into a storable format.
28
+ module Serializers
29
+ extend Group
30
+ self.type = :computer
31
+ self.targets = [:entity]
32
+ self.default = :yaml
33
+ end
34
+ # Cleaners strip format-specific markup (e.g. HTML) from a document's text.
35
+ module Cleaners
36
+ extend Group
37
+ self.type = :annotator
38
+ self.targets = [:document]
39
+ self.default = :html
40
+ end
41
+ extend Treat::Category
42
+ end
43
+ end
44
+
@@ -0,0 +1,17 @@
1
module Treat
  module Formatters
    module Cleaners
      # Replaces the HTML markup held by the text entities of a
      # document with its plain-text content, using Hpricot.
      class HTML
        silently { require 'hpricot' }
        # For each text of the document, keep the original markup in
        # the :html_value feature and set the value to the inner text
        # extracted by Hpricot. Returns the document.
        def self.clean(document, options = {})
          document.each_text do |text|
            markup = text.value
            text.set :html_value, markup
            text.value = Hpricot(markup).inner_text
          end
          document
        end
      end
    end
  end
end
@@ -0,0 +1,35 @@
1
module Treat
  module Formatters
    module Readers
      # Not a wrapper around any external tool: this reader only
      # looks at the extension of the document's file name and hands
      # the work over to the reader class named after it.
      class Autoselect
        # Extensions treated as images; files carrying one of them
        # are sent to the Ocropus OCR reader.
        ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
        # Pick a reader from the extension of document.file and
        # return whatever that reader's read method returns.
        #
        # Options:
        # :ocr => :ocropus | :gocr (the OCR engine to use).
        #
        # NOTE(review): the :ocr option is forwarded to the selected
        # reader but is not consulted here; images always go to the
        # 'ocropus' reader.
        #
        # Raises Treat::Exception when no reader constant matches.
        def self.read(document, options = {:ocr => :ocropus})
          ext = document.file.split('.')[-1]
          reader = ImageExtensions.include?(ext) ? 'ocropus' : ext
          klass =
            begin
              Treat::Formatters::Readers.const_get(cc(reader))
            rescue NameError
              raise Treat::Exception,
              "Cannot find a default reader for format: '#{ext}'."
            end
          klass.read(document, options)
        end
      end
    end
  end
end
@@ -0,0 +1,24 @@
1
module Treat
  module Formatters
    module Readers
      # A wrapper class for the GOCR engine.
      #
      # "GOCR is an OCR (Optical Character Recognition)
      # program, developed under the GNU Public License.
      # It converts scanned images of text back to text files."
      #
      # Project site: http://jocr.sourceforge.net
      class GOCR
        # Used to quote file names handed to the shell.
        require 'shellwords'
        # Read a file using the GOCR reader.
        #
        # The document's file is first converted with the `convert`
        # command into a temporary .pgm file, which is then passed to
        # the `gocr` command; the recognized text is appended to the
        # document as an entity. Returns the document.
        def self.read(document, options = {})
          create_temp_file(:pgm) do |tmp|
            # Escape both paths so that names containing spaces or
            # shell metacharacters reach the tools as one argument.
            source = Shellwords.escape(document.file)
            target = Shellwords.escape(tmp)
            `convert #{source} #{target}`
            f = `gocr #{target}`.strip
            document << Treat::Entities::Entity.from_string(f)
          end
          document
        end
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
module Treat
  module Formatters
    module Readers
      # Reads an HTML file into a document and strips its markup.
      class HTML
        # Load the file named by document.file, append the entity
        # built from its contents to the document, then return the
        # result of running the :html cleaner on the document.
        def self.read(document, options = {})
          markup = File.read(document.file)
          entity = Treat::Entities::Entity.from_string(markup)
          document << entity
          document.clean(:html)
        end
      end
    end
  end
end
@@ -0,0 +1,31 @@
1
module Treat
  module Formatters
    module Readers
      # This class is a wrapper for the Google Ocropus
      # optical character recognition (OCR) engine.
      #
      # "OCRopus(tm) is a state-of-the-art document
      # analysis and OCR system, featuring pluggable
      # layout analysis, pluggable character recognition,
      # statistical natural language modeling, and multi-
      # lingual capabilities."
      #
      # Original paper:
      # Breuel, Thomas M. The Ocropus Open Source OCR System.
      # DFKI and U. Kaiserslautern, Germany.
      class Ocropus
        # Used to quote file names handed to the shell.
        require 'shellwords'
        # Read a file using the Google Ocropus reader.
        #
        # The `ocropus page` command is run on the document's file
        # with its standard output redirected to a temporary .txt
        # file; the content of that file is appended to the document
        # as an entity. Returns the document.
        def self.read(document, options = {})
          create_temp_file(:txt) do |tmp|
            # Escape both paths so that names containing spaces or
            # shell metacharacters are passed through intact.
            source = Shellwords.escape(document.file)
            target = Shellwords.escape(tmp)
            capture(:stderr) do
              `ocropus page #{source} > #{target} -STDIO 2>/dev/null`
            end
            f = File.read(tmp)
            document << Treat::Entities::Entity.from_string(f)
          end
          document
        end
      end
    end
  end
end
@@ -0,0 +1,17 @@
1
module Treat
  module Formatters
    module Readers
      # Reads PDF files by shelling out to the `pdftotext` command.
      class PDF
        require 'fileutils'
        # Used to quote file names handed to the shell.
        require 'shellwords'
        # Read a file using the pdftotext utility.
        #
        # The text extracted from document.file is written to a
        # temporary .txt file, whose content is then appended to the
        # document as an entity. Returns the document.
        def self.read(document, options = {})
          create_temp_file(:txt) do |tmp|
            # Escape both paths so that names containing spaces or
            # shell metacharacters reach pdftotext as one argument.
            source = Shellwords.escape(document.file)
            target = Shellwords.escape(tmp)
            `pdftotext #{source} #{target}`
            document << Treat::Entities::Entity.from_string(File.read(tmp))
          end
          document
        end
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Treat
  module Formatters
    module Readers
      # Reader for plain text files.
      class Txt
        # Append to the document the entity built from the content
        # of the file named by document.file, and return the document.
        def self.read(document, options = {})
          document.tap do |doc|
            contents = File.read(doc.file)
            doc << Treat::Entities::Entity.from_string(contents)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,48 @@
1
module Treat
  module Formatters
    module Serializers
      # This class converts an entity to XML format.
      class XML
        # Require the Nokogiri XML parser.
        require 'nokogiri'
        # Characters that cannot appear verbatim in XML text or in
        # quoted attribute values, with their entity references.
        XmlEscapes = {
          '&' => '&amp;', '<' => '&lt;', '>' => '&gt;',
          "'" => '&apos;', '"' => '&quot;'
        }
        # Serialize an entity tree in XML format.
        #
        # Each entity becomes an element named after its class, with
        # its features (and, when it has features, its edges) written
        # as attributes. Entities with children nest them one level
        # deeper; leaf entities contain their value as text.
        #
        # Options:
        # :indent => the nesting depth of this entity (default 0).
        # The XML declaration is only emitted at depth 0.
        #
        # The options hash supplied by the caller is not modified.
        def self.serialize(entity, options = {})
          indent = options[:indent] || 0
          if indent == 0
            string = '<?xml version="1.0" encoding="UTF-8" standalone="no" ?>'
          else
            string = ''
          end
          spaces = ' ' * indent
          attributes = ''
          if !entity.features.nil? && entity.features.size != 0
            attributes = ' '
            entity.features.each_pair do |feature, value|
              # Features pointing at another entity are stored by id.
              value = value.id if value.is_a? Entities::Entity
              attributes << "#{feature}='#{escape(value)}' "
            end
            entity.edges.each_pair do |id, edge|
              attributes << "#{edge}='#{escape(id)}' "
            end
          end
          tag = entity.class.to_s.split('::')[-1].downcase
          # attributes[0..-2] drops the trailing separator space.
          string += "\n#{spaces}<#{tag}#{attributes[0..-2]}>"
          if entity.has_children?
            child_options = options.merge(:indent => indent + 1)
            entity.children.each do |child|
              string += serialize(child, child_options)
            end
          else
            string += "\n#{spaces}#{escape(entity.value)}"
          end
          string + "\n#{spaces}</#{tag}>"
        end
        # Replace the XML special characters of text (converted to a
        # string) with entity references, so that the output stays
        # well-formed whatever the values contain.
        def self.escape(text)
          text.to_s.gsub(/[&<>'"]/) { |char| XmlEscapes[char] }
        end
        private_class_method :escape
      end
    end
  end
end
@@ -0,0 +1,15 @@
1
module Treat
  module Formatters
    module Serializers
      # Psych provides the YAML emitter used below.
      require 'psych'
      # Serializer producing a YAML dump of an entity.
      class YAML
        class << self
          # Return the YAML text produced by Psych for the entity.
          # The options argument is accepted but not used.
          def serialize(entity, options = {})
            ::Psych.dump(entity)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,96 @@
1
+ require 'yaml'
2
+ require 'set'
3
+
4
class Class
  # The list of names registered on this class for persistence;
  # created empty on first access.
  def persist
    @persist ||= []
  end

  # Replace the list of registered names. Anything that is not an
  # Array is silently ignored.
  def persist=(p)
    @persist = p if p.kind_of?(Array)
  end

  # The names registered on this class followed by those registered
  # on each of its superclasses, without duplicates.
  def persist_with_parent
    chain = []
    klass = self
    until klass.nil?
      chain << klass
      klass = klass.superclass
    end
    chain.inject([]) { |names, k| names.concat(k.persist) }.uniq
  end
end
24
+
25
class Object
  # Register the given names (converted to strings) in the persist
  # list of the receiving class, dropping duplicates.
  def self.persistent(*var)
    persist.concat(var.map { |name| name.to_s })
    persist.uniq!
  end

  alias_method :old_to_yaml, :to_yaml

  # Emit only the instance variables registered through persistent
  # (on the class or its superclasses). Objects whose class has no
  # registered names use the original to_yaml.
  def to_yaml(opts = {})
    names = self.class.persist_with_parent
    return old_to_yaml(opts) unless names && names.size > 0
    yaml_emit(opts) do |map|
      names.each do |name|
        map.add(name, instance_variable_get('@' + name))
      end
    end
  end

  private

  # Open a YAML mapping for this object through YAML::quick_emit and
  # yield it to the caller's block.
  def yaml_emit(opts)
    YAML::quick_emit(object_id, opts) do |out|
      out.map(taguri, to_yaml_style) { |map| yield map }
    end
  end
end
58
+
59
module RHNH
  # Mixin for containers: gives them a post_deserialize hook that
  # hands every truthy element to YAML.call_post_deserialize.
  module EnumerablePostDeserializeHelper
    # Visit each element (nil and false are skipped). Returns
    # whatever the receiver's each returns.
    def post_deserialize
      each do |element|
        next unless element
        YAML.call_post_deserialize(element)
      end
    end
  end
end

# Arrays and hashes both get the hook; a hash passes each of its
# [key, value] pairs as one element.
[Array, Hash].each do |container|
  container.send(:include, RHNH::EnumerablePostDeserializeHelper)
end
76
+
77
+
78
module YAML
  class << self
    # Keep a handle on the loader of whichever YAML engine is active,
    # so that load below can delegate to it. The guard prevents the
    # alias from capturing our own override if this file is loaded twice.
    unless method_defined?(:load_without_post_deserialize)
      alias_method :load_without_post_deserialize, :load
    end
  end

  # Recursively run the post_deserialize hook of obj and of every
  # object reachable through its instance variables. Instance
  # variables are visited before the object itself.
  #
  # object_map holds the object ids already visited, so that an
  # object is processed at most once per traversal.
  def YAML.call_post_deserialize(obj, object_map = ::Set.new)
    return if object_map.include?(obj.object_id)
    object_map.add(obj.object_id)

    obj.instance_variables.each do |v|
      call_post_deserialize obj.instance_variable_get(v), object_map
    end

    obj.post_deserialize if obj.respond_to?('post_deserialize')
  end

  # Load a YAML document with the engine's own loader, then run the
  # post_deserialize hooks on the result and return it.
  #
  # Any extra arguments are passed through to the original loader.
  def YAML.load(io, *args, &block)
    yp = load_without_post_deserialize(io, *args, &block)
    call_post_deserialize yp
    yp
  end

  class << self
    # Let keyword arguments flow through *args on Rubies that
    # separate them from positional arguments.
    ruby2_keywords(:load) if respond_to?(:ruby2_keywords, true)
  end
end
@@ -0,0 +1,19 @@
1
module Treat
  module Formatters
    module Unserializers
      # Chooses an unserializer from the extension of the
      # document's file name.
      class Autoselect
        # Unserialize the document with the :yaml unserializer for
        # 'yaml'/'yml' files and with the :xml unserializer for 'xml'
        # files, returning what document.unserialize returns.
        #
        # Raises a RuntimeError for any other extension.
        def self.unserialize(document, options = {})
          ext = document.file.split('.')[-1]
          case ext
          when 'yaml', 'yml'
            document.unserialize(:yaml)
          when 'xml'
            document.unserialize(:xml)
          else
            # The two halves need a separating space to read correctly.
            raise "File #{document.file} was not recognized " +
            "as a supported serialized format."
          end
        end
      end
    end
  end
end
@@ -0,0 +1,79 @@
1
module Treat
  module Formatters
    module Unserializers
      # Rebuilds an entity tree from an XML file, reading it node by
      # node with the Nokogiri pull parser.
      class XML
        require 'nokogiri'

        # Read the XML file named by document.file, recreate the
        # entities it describes, append the resulting element to the
        # document and return the document.
        #
        # Nodes whose name is listed in Treat::Entities.list become
        # entities; any other node (such as text) supplies the value
        # of the entity currently being built.
        def self.unserialize(document, options = {})
          # Read in the XML file.
          xml = File.read(document.file)
          xml_reader = Nokogiri::XML::Reader.from_memory(xml)
          current_element = nil
          previous_depth = 0

          # Read the XML file entity by entity.
          while xml_reader.read
            # The depth in the XML tree.
            current_depth = xml_reader.depth
            # If we are at the end of the children stack, pop up.
            if previous_depth > current_depth && current_depth != 0
              current_element = current_element.parent
            end
            # If an end element has been reached,
            # change the depth and pop up on next
            # iteration.
            if xml_reader.node_type ==
              Nokogiri::XML::Reader::TYPE_END_ELEMENT
              previous_depth = current_depth
              next
            end

            id = nil
            attributes = {}; edges = {}
            xml_reader.attributes.each_pair do |k, v|
              if k == 'id'
                id = v
              elsif k == 'edges'
                edges = v
              elsif k == 'value'
                # A 'value' attribute is recognized but not used; it
                # is deliberately kept out of the features.
              else
                attributes[k.intern] = v
              end
            end

            type = xml_reader.name.intern

            if Treat::Entities.list.include?(type)
              # Entities start with an empty value; a following
              # non-entity node fills it in.
              if !current_element
                current_element = self.revive(type, '', id)
              else
                current_element = current_element <<
                self.revive(type, '', id)
              end
              current_element.features = attributes
              current_element.edges = edges
            else
              current_value = xml_reader.value.strip
              if current_value && current_value != ''
                current_element.value = current_value
              end
            end

            previous_depth = current_depth
          end
          document << current_element
          document
        end

        # Instantiate the entity class named after type with the
        # given value and id.
        def self.revive(type, value, id)
          klass = Treat::Entities.const_get(cc(type))
          klass.new(value, id)
        end

      end
    end
  end
end