treat 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- data/INSTALL +0 -0
- data/LICENSE +28 -0
- data/README +0 -0
- data/TODO +67 -0
- data/bin/INFO +1 -0
- data/examples/benchmark.rb +81 -0
- data/examples/keywords.rb +60 -0
- data/examples/texts/bugged_out.txt +26 -0
- data/examples/texts/half_cocked_basel.txt +16 -0
- data/examples/texts/hedge_funds.txt +24 -0
- data/examples/texts/hose_and_dry.txt +19 -0
- data/examples/texts/hungarys_troubles.txt +46 -0
- data/examples/texts/indias_slowdown.txt +15 -0
- data/examples/texts/merkozy_rides_again.txt +24 -0
- data/examples/texts/prada_is_not_walmart.txt +9 -0
- data/examples/texts/republican_nomination.txt +26 -0
- data/examples/texts/to_infinity_and_beyond.txt +15 -0
- data/lib/treat.rb +91 -0
- data/lib/treat/buildable.rb +115 -0
- data/lib/treat/categories.rb +29 -0
- data/lib/treat/category.rb +28 -0
- data/lib/treat/delegatable.rb +90 -0
- data/lib/treat/detectors.rb +28 -0
- data/lib/treat/detectors/encoding/native.rb +12 -0
- data/lib/treat/detectors/encoding/r_chardet19.rb +24 -0
- data/lib/treat/detectors/format/file.rb +36 -0
- data/lib/treat/detectors/language/language_detector.rb +19 -0
- data/lib/treat/detectors/language/what_language.rb +29 -0
- data/lib/treat/entities.rb +52 -0
- data/lib/treat/entities/collection.rb +19 -0
- data/lib/treat/entities/constituents.rb +15 -0
- data/lib/treat/entities/document.rb +11 -0
- data/lib/treat/entities/entity.rb +242 -0
- data/lib/treat/entities/sentence.rb +8 -0
- data/lib/treat/entities/text.rb +7 -0
- data/lib/treat/entities/tokens.rb +37 -0
- data/lib/treat/entities/zones.rb +17 -0
- data/lib/treat/exception.rb +5 -0
- data/lib/treat/extractors.rb +41 -0
- data/lib/treat/extractors/key_sentences/topics_frequency.rb +49 -0
- data/lib/treat/extractors/named_entity/abner.rb +20 -0
- data/lib/treat/extractors/named_entity/stanford.rb +174 -0
- data/lib/treat/extractors/statistics/frequency.rb +22 -0
- data/lib/treat/extractors/statistics/frequency_of.rb +17 -0
- data/lib/treat/extractors/statistics/position_in.rb +13 -0
- data/lib/treat/extractors/statistics/transition_matrix.rb +105 -0
- data/lib/treat/extractors/statistics/transition_probability.rb +53 -0
- data/lib/treat/extractors/time/chronic.rb +12 -0
- data/lib/treat/extractors/time/native.rb +12 -0
- data/lib/treat/extractors/time/nickel.rb +45 -0
- data/lib/treat/extractors/topic_words/lda.rb +71 -0
- data/lib/treat/extractors/topic_words/lda/data.dat +46 -0
- data/lib/treat/extractors/topic_words/lda/wiki.yml +121 -0
- data/lib/treat/extractors/topics/reuters.rb +91 -0
- data/lib/treat/extractors/topics/reuters/industry.xml +2717 -0
- data/lib/treat/extractors/topics/reuters/region.xml +13585 -0
- data/lib/treat/extractors/topics/reuters/topics.xml +17977 -0
- data/lib/treat/feature.rb +53 -0
- data/lib/treat/formatters.rb +44 -0
- data/lib/treat/formatters/cleaners/html.rb +17 -0
- data/lib/treat/formatters/readers/autoselect.rb +35 -0
- data/lib/treat/formatters/readers/gocr.rb +24 -0
- data/lib/treat/formatters/readers/html.rb +13 -0
- data/lib/treat/formatters/readers/ocropus.rb +31 -0
- data/lib/treat/formatters/readers/pdf.rb +17 -0
- data/lib/treat/formatters/readers/txt.rb +15 -0
- data/lib/treat/formatters/serializers/xml.rb +48 -0
- data/lib/treat/formatters/serializers/yaml.rb +15 -0
- data/lib/treat/formatters/serializers/yaml/helper.rb +96 -0
- data/lib/treat/formatters/unserializers/autoselect.rb +19 -0
- data/lib/treat/formatters/unserializers/xml.rb +79 -0
- data/lib/treat/formatters/unserializers/yaml.rb +15 -0
- data/lib/treat/formatters/visualizers/dot.rb +73 -0
- data/lib/treat/formatters/visualizers/html.rb +12 -0
- data/lib/treat/formatters/visualizers/inspect.rb +16 -0
- data/lib/treat/formatters/visualizers/short_value.rb +14 -0
- data/lib/treat/formatters/visualizers/standoff.rb +41 -0
- data/lib/treat/formatters/visualizers/tree.rb +28 -0
- data/lib/treat/formatters/visualizers/txt.rb +31 -0
- data/lib/treat/group.rb +96 -0
- data/lib/treat/inflectors.rb +50 -0
- data/lib/treat/inflectors/cardinal_words/linguistics.rb +45 -0
- data/lib/treat/inflectors/conjugators/linguistics.rb +30 -0
- data/lib/treat/inflectors/declensors/en.rb +18 -0
- data/lib/treat/inflectors/declensors/linguistics.rb +30 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma.rb +12 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/Makefile +213 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/elemma.c +68 -0
- data/lib/treat/inflectors/lemmatizers/e_lemma/extconf.rb +6 -0
- data/lib/treat/inflectors/ordinal_words/linguistics.rb +21 -0
- data/lib/treat/inflectors/stemmers/porter.rb +158 -0
- data/lib/treat/inflectors/stemmers/porter_c.rb +23 -0
- data/lib/treat/inflectors/stemmers/uea.rb +30 -0
- data/lib/treat/lexicalizers.rb +49 -0
- data/lib/treat/lexicalizers/category/from_tag.rb +30 -0
- data/lib/treat/lexicalizers/linkages/naive.rb +63 -0
- data/lib/treat/lexicalizers/synsets/rita_wn.rb +23 -0
- data/lib/treat/lexicalizers/synsets/wordnet.rb +72 -0
- data/lib/treat/lexicalizers/tag/brill.rb +101 -0
- data/lib/treat/lexicalizers/tag/lingua.rb +114 -0
- data/lib/treat/lexicalizers/tag/stanford.rb +86 -0
- data/lib/treat/processors.rb +45 -0
- data/lib/treat/processors/chunkers/txt.rb +27 -0
- data/lib/treat/processors/parsers/enju.rb +214 -0
- data/lib/treat/processors/parsers/stanford.rb +60 -0
- data/lib/treat/processors/segmenters/punkt.rb +48 -0
- data/lib/treat/processors/segmenters/stanford.rb +45 -0
- data/lib/treat/processors/segmenters/tactful.rb +34 -0
- data/lib/treat/processors/tokenizers/macintyre.rb +76 -0
- data/lib/treat/processors/tokenizers/multilingual.rb +31 -0
- data/lib/treat/processors/tokenizers/perl.rb +96 -0
- data/lib/treat/processors/tokenizers/punkt.rb +42 -0
- data/lib/treat/processors/tokenizers/stanford.rb +33 -0
- data/lib/treat/processors/tokenizers/tactful.rb +59 -0
- data/lib/treat/proxies.rb +66 -0
- data/lib/treat/registrable.rb +26 -0
- data/lib/treat/resources.rb +10 -0
- data/lib/treat/resources/categories.rb +18 -0
- data/lib/treat/resources/delegates.rb +96 -0
- data/lib/treat/resources/dependencies.rb +0 -0
- data/lib/treat/resources/edges.rb +8 -0
- data/lib/treat/resources/formats.rb +23 -0
- data/lib/treat/resources/languages.rb +86 -0
- data/lib/treat/resources/languages.txt +504 -0
- data/lib/treat/resources/tags.rb +393 -0
- data/lib/treat/sugar.rb +43 -0
- data/lib/treat/tree.rb +174 -0
- data/lib/treat/utilities.rb +127 -0
- data/lib/treat/visitable.rb +27 -0
- data/test/profile.rb +2 -0
- data/test/tc_detectors.rb +27 -0
- data/test/tc_entity.rb +105 -0
- data/test/tc_extractors.rb +48 -0
- data/test/tc_formatters.rb +46 -0
- data/test/tc_inflectors.rb +39 -0
- data/test/tc_lexicalizers.rb +39 -0
- data/test/tc_processors.rb +36 -0
- data/test/tc_resources.rb +27 -0
- data/test/tc_treat.rb +64 -0
- data/test/tc_tree.rb +60 -0
- data/test/tests.rb +19 -0
- data/test/texts.rb +20 -0
- data/test/texts/english/long.html +24 -0
- data/test/texts/english/long.txt +22 -0
- data/test/texts/english/medium.txt +5 -0
- data/test/texts/english/short.txt +3 -0
- metadata +412 -0
@@ -0,0 +1,53 @@
|
|
1
|
+
module Treat
  # A feature whose value is uncertain. It wraps a hash of
  # candidate values => probabilities and forwards every method
  # it does not define itself to its most probable value.
  class Feature
    # Undefine all methods, except those that would create
    # problems if missing (e.g. with serializing), so that
    # everything else is routed through #method_missing.
    instance_methods.each do |meth|
      next if meth =~
        /^(__|object_id|class|instance_variables|instance_variable_get)/
      undef_method(meth)
    end
    # Allows to read the probability hash,
    # the possible values of the feature,
    # and the best value (with highest P).
    attr_reader :p_hash, :values, :best
    # Initialize the feature with a hash
    # of features => probabilities.
    #
    # The probabilities are normalized first (see #normalize).
    # When several values share the highest probability, one
    # of them is picked at random as the best value.
    def initialize(p_hash)
      @p_hash = p_hash
      normalize
      max = @p_hash.values.max
      @best = @p_hash.select { |_, prob| prob == max }.keys.sample
      @values = @p_hash.keys
      type = @values[0].class
      # Classes such as Symbol, NilClass, Integer or Float cannot
      # be instantiated with .new; use the best value directly
      # for them instead of raising NoMethodError.
      @object = type.respond_to?(:new) ? type.new(@best) : @best
    end
    # Normalize the probabilities, so that
    # the sum of all probabilities is one,
    # except if the sum of all probabilities
    # is already below one (in which case we
    # assume that the feature is intentionally
    # incomplete).
    def normalize
      sum = @p_hash.inject(0.0) { |total, pair| total + pair[1] }
      return if sum <= 1.0
      scaled = {}
      @p_hash.each { |value, prob| scaled[value] = prob.to_f / sum.to_f }
      @p_hash = scaled
    end
    # Find the probability of value x.
    # Returns 0 when x is not a known value.
    def probability(x)
      @p_hash[x] ? @p_hash[x] : 0
    end
    # Alias for probability: p(x).
    alias :p :probability
    # Catch all other methods than the ones
    # explicitly defined, and forward them to
    # the object built from the best value.
    def method_missing(sym, *args, &block)
      @object.send(sym, *args, &block)
    end
  end
end
|
@@ -0,0 +1,44 @@
|
|
1
|
+
module Treat
  # Formatters handle conversion of Entities to and from
  # external file formats.
  module Formatters
    # Readers read a document and create the top-level entity
    # corresponding to the content of the document.
    module Readers
      extend Group
      self.type = :transformer
      self.targets = [:collection, :document]
      self.default = :autoselect
    end
    # Unserializers recreate entities from a serialized format.
    module Unserializers
      extend Group
      self.type = :transformer
      self.targets = [:collection, :document]
      self.default = :autoselect
    end
    # Visualizers transform entities into a visualizable format.
    module Visualizers
      extend Group
      self.type = :computer
      self.targets = [:entity]
      self.default = :tree
    end
    # Serializers transform entities into a storable format.
    module Serializers
      extend Group
      self.type = :computer
      self.targets = [:entity]
      self.default = :yaml
    end
    # Cleaners remove formatting markup (HTML by default)
    # from the texts of a document.
    module Cleaners
      extend Group
      self.type = :annotator
      self.targets = [:document]
      self.default = :html
    end
    extend Treat::Category
  end
end
|
44
|
+
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Treat
  module Formatters
    module Cleaners
      # Removes HTML markup from the texts of a document,
      # using the Hpricot parser.
      class HTML
        silently { require 'hpricot' }
        # For each text yielded by document.each_text, record the
        # current value under :html_value (via text.set), then
        # replace the value with Hpricot's inner_text for it.
        #
        # Returns the document. The options hash is accepted
        # but not used.
        def self.clean(document, options = {})
          document.each_text do |text|
            text.set :html_value, text.value
            text.value = Hpricot(text.value).inner_text
          end
          document
        end
      end
    end
  end
end
|
@@ -0,0 +1,35 @@
|
|
1
|
+
module Treat
  module Formatters
    module Readers
      # This class isn't a wrapper for anything.
      # It simply delegates the reading task to
      # the appropriate reader based on the file
      # extension of the supplied document.
      class Autoselect
        # A list of image extensions that should be routed
        # to the Ocropus OCR engine.
        ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
        # Select the appropriate reader based on the format
        # of the filename in document, and return whatever
        # that reader's read method returns.
        #
        # Options:
        # :ocr => :ocropus | :gocr (the OCR engine to use).
        #
        # NOTE(review): the :ocr option is passed along to the
        # selected reader but is not consulted here; images are
        # always routed to the 'ocropus' reader. Confirm whether
        # :gocr is meant to be selectable.
        #
        # Raises Treat::Exception when no reader constant can be
        # resolved for the extension.
        def self.read(document, options = {:ocr => :ocropus})
          # Everything after the last dot is taken as the extension.
          ext = document.file.split('.')[-1]
          reader = ImageExtensions.include?(ext) ? 'ocropus' : ext
          begin
            r = Treat::Formatters::Readers.const_get(cc(reader))
          rescue NameError
            raise Treat::Exception,
            "Cannot find a default reader for format: '#{ext}'."
          end
          r.read(document, options)
        end
      end
    end
  end
end
|
@@ -0,0 +1,24 @@
|
|
1
|
+
module Treat
  module Formatters
    module Readers
      # A wrapper class for the GOCR engine.
      #
      # "GOCR is an OCR (Optical Character Recognition)
      # program, developed under the GNU Public License.
      # It converts scanned images of text back to text files."
      #
      # Project site: http://jocr.sourceforge.net
      class GOCR
        # Used to quote file paths passed to the shell.
        require 'shellwords'
        # Read a file using the GOCR reader.
        #
        # The document's file is first converted to a temporary
        # PGM file with the `convert` command, then `gocr` is run
        # on that file and its stripped output is appended to the
        # document as an entity built from the string.
        #
        # Returns the document. The options hash is not used.
        def self.read(document, options = {})
          create_temp_file(:pgm) do |tmp|
            # Escape both paths so that spaces or shell
            # metacharacters in file names cannot break (or
            # inject into) the command line.
            source = Shellwords.escape(document.file.to_s)
            target = Shellwords.escape(tmp.to_s)
            `convert #{source} #{target}`
            f = `gocr #{target}`.strip
            document << Treat::Entities::Entity.from_string(f)
          end
          document
        end
      end
    end
  end
end
|
@@ -0,0 +1,31 @@
|
|
1
|
+
module Treat
  module Formatters
    module Readers
      # This class is a wrapper for the Google Ocropus
      # optical character recognition (OCR) engine.
      #
      # "OCRopus(tm) is a state-of-the-art document
      # analysis and OCR system, featuring pluggable
      # layout analysis, pluggable character recognition,
      # statistical natural language modeling, and multi-
      # lingual capabilities."
      #
      # Original paper:
      # Breuel, Thomas M. The Ocropus Open Source OCR System.
      # DFKI and U. Kaiserslautern, Germany.
      class Ocropus
        # Used to quote file paths passed to the shell.
        require 'shellwords'
        # Read a file using the Google Ocropus reader.
        #
        # Runs `ocropus page` on the document's file, redirecting
        # its output to a temporary text file, then appends an
        # entity built from that file's content to the document.
        #
        # Returns the document. The options hash is not used.
        def self.read(document, options = {})
          create_temp_file(:txt) do |tmp|
            # Escape both paths so that spaces or shell
            # metacharacters in file names cannot break (or
            # inject into) the command line.
            source = Shellwords.escape(document.file.to_s)
            target = Shellwords.escape(tmp.to_s)
            capture(:stderr) do
              `ocropus page #{source} > #{target} -STDIO 2>/dev/null`
            end
            f = File.read(tmp)
            document << Treat::Entities::Entity.from_string(f)
          end
          document
        end
      end
    end
  end
end
|
@@ -0,0 +1,17 @@
|
|
1
|
+
module Treat
  module Formatters
    module Readers
      # Reads PDF documents by converting them to plain text
      # with the external `pdftotext` command.
      class PDF
        require 'fileutils'
        # Used to quote file paths passed to the shell.
        require 'shellwords'
        # Read a file using the pdftotext utility.
        #
        # The text is extracted into a temporary file, whose
        # content is appended to the document as an entity built
        # from the string.
        #
        # Returns the document. The options hash is not used.
        def self.read(document, options = {})
          create_temp_file(:txt) do |tmp|
            # Escape both paths so that spaces or shell
            # metacharacters in file names cannot break (or
            # inject into) the command line.
            source = Shellwords.escape(document.file.to_s)
            target = Shellwords.escape(tmp.to_s)
            `pdftotext #{source} #{target}`
            document << Treat::Entities::Entity.from_string(File.read(tmp))
          end
          document
        end
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Treat
  module Formatters
    module Readers
      # This class simply reads a plain text file.
      class Txt
        # Read the whole file named by document.file, build an
        # entity from its content and append it to the document.
        #
        # Returns the document. The options hash is not used.
        def self.read(document, options = {})
          content = File.read(document.file)
          document << Treat::Entities::Entity.from_string(content)
          document
        end
      end
    end
  end
end
|
@@ -0,0 +1,48 @@
|
|
1
|
+
module Treat
  module Formatters
    module Serializers
      # This class converts an entity to XML format.
      class XML
        # Require the Nokogiri XML parser.
        require 'nokogiri'
        # Characters that cannot appear literally in XML text or
        # in single-quoted attribute values, with their entities.
        XML_ESCAPES = {
          '&' => '&amp;', '<' => '&lt;', '>' => '&gt;',
          "'" => '&apos;', '"' => '&quot;'
        }
        # Serialize an entity tree in XML format.
        #
        # Each entity becomes an element named after its
        # (downcased, unqualified) class name. Entities with
        # children are serialized recursively, one indentation
        # level deeper; entities without children contain their
        # value as text.
        #
        # Options:
        # :indent => the current nesting depth (default 0). The
        # XML declaration is only emitted at depth 0.
        #
        # Returns the XML as a string.
        def self.serialize(entity, options = {})
          options = options.merge(:indent => 0) if options[:indent].nil?
          indent = options[:indent]
          if indent == 0
            string = '<?xml version="1.0" encoding="UTF-8" standalone="no" ?>'
          else
            string = ''
          end
          spaces = ' ' * indent
          tag = entity.class.to_s.split('::')[-1].downcase
          string += "\n#{spaces}<#{tag}#{attribute_string(entity)}>"
          if entity.has_children?
            # Pass a fresh options hash down instead of mutating
            # the caller's one while recursing.
            child_options = options.merge(:indent => indent + 1)
            entity.children.each do |child|
              string += serialize(child, child_options)
            end
          else
            string += "\n#{spaces}#{escape_xml(entity.value)}"
          end
          string + "\n#{spaces}</#{tag}>"
        end

        # Build the attribute list of an entity's opening tag
        # (with a leading space), or '' if it has no features.
        # Features whose value is an entity are written as that
        # entity's id.
        #
        # NOTE(review): edges are only written when the entity
        # has at least one feature, as in the original code;
        # confirm whether that is intended.
        def self.attribute_string(entity)
          features = entity.features
          return '' if features.nil? || features.size == 0
          pairs = []
          features.each_pair do |feature, value|
            value = value.id if value.is_a?(Entities::Entity)
            pairs << "#{feature}='#{escape_xml(value)}'"
          end
          entity.edges.each_pair do |id, edge|
            pairs << "#{edge}='#{escape_xml(id)}'"
          end
          ' ' + pairs.join(' ')
        end
        private_class_method :attribute_string

        # Replace XML special characters in the string form of
        # text by the corresponding entity references, so that
        # values containing &, <, > or quotes yield valid XML.
        def self.escape_xml(text)
          text.to_s.gsub(/[&<>'"]/, XML_ESCAPES)
        end
        private_class_method :escape_xml
      end
    end
  end
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module Treat
  module Formatters
    module Serializers
      # Require the Psych YAML serializer.
      require 'psych'
      # This class serializes entities in YAML format.
      # Inside this namespace the constant YAML refers to this
      # class, which is why Psych is addressed as ::Psych below.
      class YAML
        # Serialize an entity in YAML format.
        #
        # Returns the result of Psych.dump for the entity.
        # The options hash is accepted but not used.
        def self.serialize(entity, options = {})
          ::Psych.dump(entity)
        end
      end
    end
  end
end
|
@@ -0,0 +1,96 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'set'
|
3
|
+
|
4
|
+
# Adds to every class a list of the names of the instance
# variables that should be persisted.
class Class
  # The list of persisted names declared on this class itself
  # (created empty on first access).
  def persist
    @persist ||= []
  end

  # Replace the list of persisted names.
  # Anything that is not an Array is silently ignored.
  def persist=(p)
    @persist = p if p.kind_of?(Array)
  end

  # The persisted names of this class followed by those of
  # each of its superclasses, without duplicates.
  def persist_with_parent
    names = []
    klass = self
    while klass
      names.concat(klass.persist)
      klass = klass.superclass
    end
    names.uniq
  end
end
|
24
|
+
|
25
|
+
# Restricts YAML output to the instance variables that a class
# (or one of its superclasses) declared with `persistent`.
class Object
  # Declare the given instance variable names (without the
  # leading '@') as persistent for this class. Names are
  # stored as strings.
  def self.persistent(*var)
    persist.concat(var.map { |name| name.to_s })
    persist.uniq!
  end

  # Keep the library's own implementation reachable.
  alias_method :old_to_yaml, :to_yaml

  # Emit only the persistent instance variables when the class
  # declares any; otherwise fall back to the original to_yaml.
  def to_yaml(opts = {})
    names = self.class.persist_with_parent

    if names && names.size > 0
      yaml_emit opts do |map|
        names.each do |name|
          map.add(name, instance_variable_get('@' + name))
        end
      end
    else
      old_to_yaml opts
    end
  end

  private

  # Open a YAML mapping for this object and yield it.
  #
  # NOTE(review): quick_emit, taguri and to_yaml_style look like
  # the older Syck emitter API; confirm they are available with
  # the YAML engine actually in use.
  def yaml_emit(opts)
    YAML::quick_emit(object_id, opts) do |out|
      out.map(taguri, to_yaml_style) do |map|
        yield map
      end
    end
  end
end
|
58
|
+
|
59
|
+
module RHNH
  # Mixin for collections: propagates the post-deserialize
  # pass to every element of the collection.
  module EnumerablePostDeserializeHelper
    # Run YAML.call_post_deserialize on each non-nil/non-false
    # element yielded by #each.
    def post_deserialize
      each do |element|
        YAML.call_post_deserialize(element) if element
      end
    end
  end
end

# Arrays forward the post-deserialize pass to their elements.
class Array
  include RHNH::EnumerablePostDeserializeHelper
end

# Hashes forward the post-deserialize pass to whatever their
# #each yields.
class Hash
  include RHNH::EnumerablePostDeserializeHelper
end
|
76
|
+
|
77
|
+
|
78
|
+
module YAML
  class << self
    # Keep the library's own loader reachable, whichever YAML
    # engine backs this module. The guard prevents the override
    # below from being aliased to itself if this file is
    # loaded twice.
    unless method_defined?(:load_without_post_deserialize)
      alias_method :load_without_post_deserialize, :load
    end
  end

  # Call post_deserialize on obj and, before that, recursively
  # on every object reachable through its instance variables.
  # object_map holds the object ids already visited, so shared
  # or cyclic references are processed only once.
  def YAML.call_post_deserialize(obj, object_map = ::Set.new)
    return if object_map.include?(obj.object_id)
    object_map.add(obj.object_id)

    obj.instance_variables.each do |v|
      call_post_deserialize obj.instance_variable_get(v), object_map
    end

    obj.post_deserialize if obj.respond_to?('post_deserialize')
  end

  # Load a YAML document with the library's original loader,
  # then run the post-deserialize pass on the result.
  # Any extra arguments are forwarded to the original loader.
  #
  # Returns the loaded object.
  def YAML.load(io, *args, &block)
    yp = load_without_post_deserialize(io, *args, &block)
    call_post_deserialize yp
    yp
  end

  class << self
    # On Rubies that separate keyword arguments, make *args
    # forward keywords unchanged to the original loader.
    ruby2_keywords(:load) if respond_to?(:ruby2_keywords, true)
  end
end
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Treat
  module Formatters
    module Unserializers
      # Delegates unserialization to the unserializer that
      # matches the file extension of the supplied document.
      class Autoselect
        # Call document.unserialize with :yaml for '.yaml' and
        # '.yml' files, and with :xml for '.xml' files.
        #
        # Raises a RuntimeError for any other extension.
        # The options hash is accepted but not used.
        def self.unserialize(document, options = {})
          # Everything after the last dot is taken as the extension.
          ext = document.file.split('.')[-1]
          case ext
          when 'yaml', 'yml'
            document.unserialize(:yaml)
          when 'xml'
            document.unserialize(:xml)
          else
            raise "File #{document.file} was not recognized " +
            "as a supported serialized format."
          end
        end
      end
    end
  end
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
module Treat
  module Formatters
    module Unserializers
      # Recreates an entity tree from an XML file, reading it
      # node by node with the Nokogiri XML reader.
      class XML
        require 'nokogiri'

        # Rebuild the entity tree stored in document.file and
        # append the element that is current when reading ends
        # to the document.
        #
        # Returns the document. The options hash is not used.
        def self.unserialize(document, options = {})
          # Read in the XML file.
          xml = File.read(document.file)
          xml_reader = Nokogiri::XML::Reader.from_memory(xml)
          current_element = nil
          previous_depth = 0

          # Read the XML file entity by entity.
          while xml_reader.read
            # The depth in the XML tree.
            current_depth = xml_reader.depth
            # If we are at the end of the children stack, pop up.
            if previous_depth > current_depth && current_depth != 0
              current_element = current_element.parent
            end
            # If an end element has been reached,
            # change the depth and pop up on next
            # iteration.
            if xml_reader.node_type ==
              Nokogiri::XML::Reader::TYPE_END_ELEMENT
              previous_depth = current_depth
              next
            end

            id = nil
            attributes = {}; edges = {}
            xml_reader.attributes.each_pair do |k, v|
              case k
              when 'id'
                id = v
              when 'edges'
                edges = v
              when 'value'
                # NOTE(review): a 'value' attribute is recognized
                # but was never applied to the entity in the
                # original code; it is only kept out of the
                # features here. Confirm whether it should be used.
              else
                attributes[k.intern] = v
              end
            end

            type = xml_reader.name.intern

            if Treat::Entities.list.include?(type)
              # Entities are revived with an empty value; their
              # value is filled in from the text nodes below.
              revived = self.revive(type, '', id)
              if !current_element
                current_element = revived
              else
                current_element = current_element << revived
              end
              current_element.features = attributes
              current_element.edges = edges
            else
              # Nodes without a text value report nil here;
              # treat that like empty text instead of failing.
              current_value = xml_reader.value.to_s.strip
              if current_value != ''
                current_element.value = current_value
              end
            end

            previous_depth = current_depth
          end
          document << current_element
          document
        end

        # Instantiate the entity class named by type (resolved
        # under Treat::Entities) with the given value and id.
        def self.revive(type, value, id)
          klass = Treat::Entities.const_get(cc(type))
          klass.new(value, id)
        end

      end
    end
  end
end
|