treat 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +7 -8
 - data/TODO +16 -13
 - data/examples/keywords.rb +89 -1
 - data/lib/treat/buildable.rb +1 -8
 - data/lib/treat/categories.rb +3 -4
 - data/lib/treat/category.rb +1 -1
 - data/lib/treat/delegatable.rb +1 -1
 - data/lib/treat/detectors/encoding/native.rb +5 -0
 - data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
 - data/lib/treat/detectors/language/language_detector.rb +4 -0
 - data/lib/treat/detectors/language/what_language.rb +4 -4
 - data/lib/treat/detectors.rb +1 -1
 - data/lib/treat/entities/entity.rb +5 -3
 - data/lib/treat/entities/tokens.rb +14 -5
 - data/lib/treat/entities/zones.rb +4 -0
 - data/lib/treat/entities.rb +7 -5
 - data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
 - data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
 - data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
 - data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
 - data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
 - data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
 - data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
 - data/lib/treat/extractors/time/chronic.rb +8 -0
 - data/lib/treat/extractors/time/native.rb +6 -0
 - data/lib/treat/extractors/time/nickel.rb +31 -23
 - data/lib/treat/extractors/topic_words/lda.rb +21 -16
 - data/lib/treat/extractors/topics/reuters.rb +6 -4
 - data/lib/treat/extractors.rb +7 -7
 - data/lib/treat/formatters/readers/abw.rb +32 -0
 - data/lib/treat/formatters/readers/autoselect.rb +13 -11
 - data/lib/treat/formatters/readers/doc.rb +13 -0
 - data/lib/treat/formatters/readers/gocr.rb +2 -0
 - data/lib/treat/formatters/readers/html.rb +21 -1
 - data/lib/treat/formatters/readers/ocropus.rb +3 -3
 - data/lib/treat/formatters/readers/odt.rb +41 -0
 - data/lib/treat/formatters/readers/pdf.rb +5 -2
 - data/lib/treat/formatters/readers/txt.rb +2 -0
 - data/lib/treat/formatters/serializers/xml.rb +3 -2
 - data/lib/treat/formatters/serializers/yaml.rb +2 -0
 - data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
 - data/lib/treat/formatters/unserializers/xml.rb +6 -1
 - data/lib/treat/formatters/unserializers/yaml.rb +5 -1
 - data/lib/treat/formatters/visualizers/dot.rb +35 -37
 - data/lib/treat/formatters/visualizers/html.rb +1 -0
 - data/lib/treat/formatters/visualizers/inspect.rb +4 -0
 - data/lib/treat/formatters/visualizers/short_value.rb +18 -3
 - data/lib/treat/formatters/visualizers/standoff.rb +11 -6
 - data/lib/treat/formatters/visualizers/tree.rb +5 -1
 - data/lib/treat/formatters/visualizers/txt.rb +6 -1
 - data/lib/treat/formatters.rb +1 -1
 - data/lib/treat/group.rb +4 -3
 - data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
 - data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
 - data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
 - data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
 - data/lib/treat/inflectors/stem/porter.rb +6 -2
 - data/lib/treat/inflectors/stem/porter_c.rb +4 -1
 - data/lib/treat/inflectors/stem/uea.rb +4 -4
 - data/lib/treat/languages/english/tags.rb +16 -0
 - data/lib/treat/languages/english.rb +4 -1
 - data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
 - data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
 - data/lib/treat/lexicalizers/tag/brill.rb +3 -11
 - data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
 - data/lib/treat/lexicalizers.rb +0 -2
 - data/lib/treat/processors/chunkers/txt.rb +4 -4
 - data/lib/treat/processors/parsers/enju.rb +3 -17
 - data/lib/treat/processors/parsers/stanford.rb +4 -0
 - data/lib/treat/processors/segmenters/punkt.rb +1 -0
 - data/lib/treat/processors/segmenters/stanford.rb +4 -0
 - data/lib/treat/processors/segmenters/tactful.rb +4 -1
 - data/lib/treat/processors/tokenizers/punkt.rb +1 -2
 - data/lib/treat/processors/tokenizers/stanford.rb +4 -0
 - data/lib/treat/processors/tokenizers/tactful.rb +1 -1
 - data/lib/treat/processors.rb +4 -4
 - data/lib/treat/proxies.rb +18 -11
 - data/lib/treat/registrable.rb +12 -5
 - data/lib/treat/sugar.rb +8 -3
 - data/lib/treat/tree.rb +10 -3
 - data/lib/treat.rb +55 -55
 - data/test/tc_entity.rb +7 -7
 - data/test/tc_extractors.rb +6 -4
 - data/test/tc_formatters.rb +0 -4
 - data/test/tests.rb +2 -0
 - data/test/texts.rb +4 -4
 - metadata +48 -56
 - data/examples/texts/bugged_out.txt +0 -26
 - data/examples/texts/half_cocked_basel.txt +0 -16
 - data/examples/texts/hedge_funds.txt +0 -24
 - data/examples/texts/hose_and_dry.txt +0 -19
 - data/examples/texts/hungarys_troubles.txt +0 -46
 - data/examples/texts/indias_slowdown.txt +0 -15
 - data/examples/texts/merkozy_rides_again.txt +0 -24
 - data/examples/texts/prada_is_not_walmart.txt +0 -9
 - data/examples/texts/republican_nomination.txt +0 -26
 - data/examples/texts/to_infinity_and_beyond.txt +0 -15
 - data/lib/treat/entities/text.rb +0 -7
 - data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
 - data/lib/treat/formatters/cleaners/html.rb +0 -17
 
| 
         @@ -1,6 +1,8 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Treat
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Extractors
         
     | 
| 
       3 
3 
     | 
    
         
             
                module Statistics
         
     | 
| 
      
 4 
     | 
    
         
            +
                  # Experimental algorithm to calculate the transition
         
     | 
| 
      
 5 
     | 
    
         
            +
                  # probability of an observed word.
         
     | 
| 
       4 
6 
     | 
    
         
             
                  class TransitionProbability
         
     | 
| 
       5 
7 
     | 
    
         | 
| 
       6 
8 
     | 
    
         
             
                    # Find the transition probability.
         
     | 
| 
         @@ -18,14 +20,16 @@ module Treat 
     | 
|
| 
       18 
20 
     | 
    
         
             
                          next unless tm[f1][v1]
         
     | 
| 
       19 
21 
     | 
    
         | 
| 
       20 
22 
     | 
    
         
             
                          relationships.each do |relationship|
         
     | 
| 
       21 
     | 
    
         
            -
                            relatives =  
     | 
| 
      
 23 
     | 
    
         
            +
                            relatives = entity.send(relationship)
         
     | 
| 
       22 
24 
     | 
    
         
             
                            relatives = [relatives] unless relatives.is_a? Array
         
     | 
| 
       23 
25 
     | 
    
         
             
                            relatives.each do |relative|
         
     | 
| 
       24 
26 
     | 
    
         
             
                              next if relative.nil? || !relative.has?(f2)
         
     | 
| 
       25 
27 
     | 
    
         
             
                              v2 = relative.send(f2)
         
     | 
| 
       26 
     | 
    
         
            -
                              if tm[f1][v1][relationship] 
     | 
| 
       27 
     | 
    
         
            -
                                 
     | 
| 
       28 
     | 
    
         
            -
                                 
     | 
| 
      
 28 
     | 
    
         
            +
                              if tm[f1][v1][relationship] && 
         
     | 
| 
      
 29 
     | 
    
         
            +
                                tm[f1][v1][relationship][f2] && 
         
     | 
| 
      
 30 
     | 
    
         
            +
                                tm[f1][v1][relationship][f2][v2]
         
     | 
| 
      
 31 
     | 
    
         
            +
                                  score += tm[f1][v1][relationship][f2][v2]
         
     | 
| 
      
 32 
     | 
    
         
            +
                                  count += 1
         
     | 
| 
       29 
33 
     | 
    
         
             
                              end
         
     | 
| 
       30 
34 
     | 
    
         
             
                            end
         
     | 
| 
       31 
35 
     | 
    
         
             
                          end
         
     | 
| 
         @@ -1,8 +1,16 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Treat
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Extractors
         
     | 
| 
       3 
3 
     | 
    
         
             
                module Time
         
     | 
| 
      
 4 
     | 
    
         
            +
                  # A wrapper for the 'chronic' gem, which parses
         
     | 
| 
      
 5 
     | 
    
         
            +
                  # time and date information.
         
     | 
| 
      
 6 
     | 
    
         
            +
                  # 
         
     | 
| 
      
 7 
     | 
    
         
            +
                  # Project website: http://chronic.rubyforge.org/
         
     | 
| 
       4 
8 
     | 
    
         
             
                  class Chronic
         
     | 
| 
       5 
9 
     | 
    
         
             
                    silence_warnings { require 'chronic' }
         
     | 
| 
      
 10 
     | 
    
         
            +
                    # Return the time information contained within the entity
         
     | 
| 
      
 11 
     | 
    
         
            +
                    # by parsing it with the 'chronic' gem.
         
     | 
| 
      
 12 
     | 
    
         
            +
                    # 
         
     | 
| 
      
 13 
     | 
    
         
            +
                    # Options: none.
         
     | 
| 
       6 
14 
     | 
    
         
             
                    def self.time(entity, options = {})
         
     | 
| 
       7 
15 
     | 
    
         
             
                      silence_warnings { ::Chronic.parse(entity.to_s, {:guess => true}) }
         
     | 
| 
       8 
16 
     | 
    
         
             
                    end
         
     | 
| 
         @@ -1,8 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Treat
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Extractors
         
     | 
| 
       3 
3 
     | 
    
         
             
                module Time
         
     | 
| 
      
 4 
     | 
    
         
            +
                  # A wrapper for Ruby's native date/time parsing.
         
     | 
| 
       4 
5 
     | 
    
         
             
                  module Native
         
     | 
| 
       5 
6 
     | 
    
         
             
                    require 'date'
         
     | 
| 
      
 7 
     | 
    
         
            +
                    # Return a DateTime object representing the date/time
         
     | 
| 
      
 8 
     | 
    
         
            +
                    # contained within the entity, using Ruby's native
         
     | 
| 
      
 9 
     | 
    
         
            +
                    # date/time parser.
         
     | 
| 
      
 10 
     | 
    
         
            +
                    #
         
     | 
| 
      
 11 
     | 
    
         
            +
                    # Options: none.
         
     | 
| 
       6 
12 
     | 
    
         
             
                    def self.time(entity, options = {})
         
     | 
| 
       7 
13 
     | 
    
         
             
                      ::DateTime.parse(entity.to_s)
         
     | 
| 
       8 
14 
     | 
    
         
             
                    end
         
     | 
| 
         @@ -1,45 +1,53 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Treat
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Extractors
         
     | 
| 
       3 
3 
     | 
    
         
             
                module Time
         
     | 
| 
       4 
     | 
    
         
            -
             
     | 
| 
       5 
     | 
    
         
            -
                   
     | 
| 
       6 
     | 
    
         
            -
             
     | 
| 
       7 
     | 
    
         
            -
                   
     | 
| 
       8 
     | 
    
         
            -
                  
         
     | 
| 
       9 
     | 
    
         
            -
                   
     | 
| 
       10 
     | 
    
         
            -
                   
     | 
| 
       11 
     | 
    
         
            -
                   
     | 
| 
       12 
     | 
    
         
            -
                   
     | 
| 
       13 
     | 
    
         
            -
                   
     | 
| 
       14 
     | 
    
         
            -
             
     | 
| 
       15 
     | 
    
         
            -
             
     | 
| 
      
 4 
     | 
    
         
            +
                  # A wrapper for the 'nickel' gem, which parses 
         
     | 
| 
      
 5 
     | 
    
         
            +
                  # times and dates and supplies additional information
         
     | 
| 
      
 6 
     | 
    
         
            +
                  # concerning these. The additional information supplied
         
     | 
| 
      
 7 
     | 
    
         
            +
                  # that this class annotates entities with is:
         
     | 
| 
      
 8 
     | 
    
         
            +
                  #
         
     | 
| 
      
 9 
     | 
    
         
            +
                  # - time_recurrence: frequency of recurrence in words*.
         
     | 
| 
      
 10 
     | 
    
         
            +
                  # - time_recurrence_interval: frequency of recurrence in days.
         
     | 
| 
      
 11 
     | 
    
         
            +
                  # - start_time: a DateTime object representing the beginning of
         
     | 
| 
      
 12 
     | 
    
         
            +
                  #   an event.
         
     | 
| 
      
 13 
     | 
    
         
            +
                  # - end_time: a DateTime object representing the end of an event.
         
     | 
| 
      
 14 
     | 
    
         
            +
                  # 
         
     | 
| 
      
 15 
     | 
    
         
            +
                  # Examples of values for time_recurrence are:
         
     | 
| 
      
 16 
     | 
    
         
            +
                  #
         
     | 
| 
      
 17 
     | 
    
         
            +
                  # - single: "lunch with megan tomorrow at noon"
         
     | 
| 
      
 18 
     | 
    
         
            +
                  # - daily: "Art exhibit until March 1st"
         
     | 
| 
      
 19 
     | 
    
         
            +
                  # - weekly: "math class every wed from 8-11am"
         
     | 
| 
      
 20 
     | 
    
         
            +
                  # - daymonthly: "open bar at joes the first friday of every month"
         
     | 
| 
      
 21 
     | 
    
         
            +
                  # - datemonthly: "pay credit card bill on the 22nd of each month"
         
     | 
| 
      
 22 
     | 
    
         
            +
                  # 
         
     | 
| 
      
 23 
     | 
    
         
            +
                  # Project website: http://naturalinputs.com/
         
     | 
| 
       16 
24 
     | 
    
         
             
                  module Nickel
         
     | 
| 
       17 
25 
     | 
    
         
             
                    require 'date'
         
     | 
| 
       18 
26 
     | 
    
         
             
                    silence_warnings { require 'nickel' }
         
     | 
| 
      
 27 
     | 
    
         
            +
                    # Extract time information from a bit of text.
         
     | 
| 
       19 
28 
     | 
    
         
             
                    def self.time(entity, options = {})
         
     | 
| 
       20 
29 
     | 
    
         
             
                      n = silence_warnings { ::Nickel.parse(entity.to_s) }
         
     | 
| 
       21 
30 
     | 
    
         
             
                      occ = n.occurrences[0]
         
     | 
| 
       22 
     | 
    
         
            -
             
     | 
| 
      
 31 
     | 
    
         
            +
             
     | 
| 
       23 
32 
     | 
    
         
             
                      rec = occ.type.to_s.gsub('single', 'once').intern
         
     | 
| 
       24 
33 
     | 
    
         
             
                      entity.set :time_recurrence, rec
         
     | 
| 
       25 
     | 
    
         
            -
                      interval = occ.interval ? occ.interval 
     | 
| 
      
 34 
     | 
    
         
            +
                      interval = occ.interval ? occ.interval : :none
         
     | 
| 
       26 
35 
     | 
    
         
             
                      entity.set :time_recurrence_interval, interval
         
     | 
| 
       27 
     | 
    
         
            -
             
     | 
| 
      
 36 
     | 
    
         
            +
             
     | 
| 
       28 
37 
     | 
    
         
             
                      s = [occ.start_date, occ.start_time]
         
     | 
| 
       29 
38 
     | 
    
         
             
                      ds = [s[0].year, s[0].month, s[0].day] if s[0]
         
     | 
| 
       30 
     | 
    
         
            -
                      ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
         
     | 
| 
       31 
     | 
    
         
            -
             
     | 
| 
      
 39 
     | 
    
         
            +
                      #ts = [s[1].hour, s[1].min, s[1].sec] if s[1]
         
     | 
| 
      
 40 
     | 
    
         
            +
             
     | 
| 
       32 
41 
     | 
    
         
             
                      e = [occ.end_date, occ.end_time]
         
     | 
| 
       33 
42 
     | 
    
         
             
                      de = [e[0].year, e[0].month, e[0].day] if e[0]
         
     | 
| 
       34 
     | 
    
         
            -
                      te = [e[1].hour, e[1].min, e[1].sec] if e[1]
         
     | 
| 
       35 
     | 
    
         
            -
             
     | 
| 
       36 
     | 
    
         
            -
                      entity.set :start_time, ::DateTime.civil(*ds 
     | 
| 
       37 
     | 
    
         
            -
                      entity.set :end_time, ::DateTime.civil(*de 
     | 
| 
       38 
     | 
    
         
            -
             
     | 
| 
      
 43 
     | 
    
         
            +
                      #te = [e[1].hour, e[1].min, e[1].sec] if e[1]
         
     | 
| 
      
 44 
     | 
    
         
            +
             
     | 
| 
      
 45 
     | 
    
         
            +
                      entity.set :start_time, ::DateTime.civil(*ds) if ds
         
     | 
| 
      
 46 
     | 
    
         
            +
                      entity.set :end_time, ::DateTime.civil(*de) if de
         
     | 
| 
      
 47 
     | 
    
         
            +
             
     | 
| 
       39 
48 
     | 
    
         
             
                      entity.start_time
         
     | 
| 
       40 
49 
     | 
    
         
             
                    end
         
     | 
| 
       41 
50 
     | 
    
         
             
                  end
         
     | 
| 
       42 
51 
     | 
    
         
             
                end
         
     | 
| 
       43 
52 
     | 
    
         
             
              end
         
     | 
| 
       44 
53 
     | 
    
         
             
            end
         
     | 
| 
       45 
     | 
    
         
            -
             
     | 
| 
         @@ -9,6 +9,8 @@ module Treat 
     | 
|
| 
       9 
9 
     | 
    
         
             
                  # Blei, David M., Ng, Andrew Y., and Jordan, Michael
         
     | 
| 
       10 
10 
     | 
    
         
             
                  # I. 2003. Latent dirichlet allocation. Journal of
         
     | 
| 
       11 
11 
     | 
    
         
             
                  # Machine Learning Research. 3 (Mar. 2003), 993-1022.
         
     | 
| 
      
 12 
     | 
    
         
            +
                  #
         
     | 
| 
      
 13 
     | 
    
         
            +
                  # Project website: https://github.com/ealdent/lda-ruby
         
     | 
| 
       12 
14 
     | 
    
         
             
                  class LDA
         
     | 
| 
       13 
15 
     | 
    
         
             
                    # Require the lda-ruby gem.
         
     | 
| 
       14 
16 
     | 
    
         
             
                    silence_warnings { require 'lda-ruby' }
         
     | 
| 
         @@ -17,25 +19,28 @@ module Treat 
     | 
|
| 
       17 
19 
     | 
    
         
             
                    Lda::TextCorpus.class_eval do
         
     | 
| 
       18 
20 
     | 
    
         
             
                      # Ruby, Y U NO SHUT UP!
         
     | 
| 
       19 
21 
     | 
    
         
             
                      silence_warnings { undef :initialize }
         
     | 
| 
       20 
     | 
    
         
            -
                      # Redefine initialize to take in an array of  
     | 
| 
       21 
     | 
    
         
            -
                      def initialize( 
     | 
| 
      
 22 
     | 
    
         
            +
                      # Redefine initialize to take in an array of sections
         
     | 
| 
      
 23 
     | 
    
         
            +
                      def initialize(sections)
         
     | 
| 
       22 
24 
     | 
    
         
             
                        super(nil)
         
     | 
| 
       23 
     | 
    
         
            -
                         
     | 
| 
       24 
     | 
    
         
            -
                          add_document(Lda::TextDocument.new(self,  
     | 
| 
      
 25 
     | 
    
         
            +
                        sections.each do |section|
         
     | 
| 
      
 26 
     | 
    
         
            +
                          add_document(Lda::TextDocument.new(self, section))
         
     | 
| 
       25 
27 
     | 
    
         
             
                        end
         
     | 
| 
       26 
28 
     | 
    
         
             
                      end
         
     | 
| 
       27 
29 
     | 
    
         
             
                    end
         
     | 
| 
      
 30 
     | 
    
         
            +
                    # Default options for the LDA algorithm.
         
     | 
| 
      
 31 
     | 
    
         
            +
                    DefaultOptions = {
         
     | 
| 
      
 32 
     | 
    
         
            +
                      topics: 20,
         
     | 
| 
      
 33 
     | 
    
         
            +
                      words_per_topic: 10,
         
     | 
| 
      
 34 
     | 
    
         
            +
                      iterations: 20
         
     | 
| 
      
 35 
     | 
    
         
            +
                    }
         
     | 
| 
      
 36 
     | 
    
         
            +
                    # Retrieve the topic words of a collection.
         
     | 
| 
       28 
37 
     | 
    
         
             
                    def self.topic_words(collection, options = {})
         
     | 
| 
       29 
     | 
    
         
            -
                       
     | 
| 
       30 
     | 
    
         
            -
                      options[:words_per_topic] ||= 10
         
     | 
| 
       31 
     | 
    
         
            -
                      options[:topics] ||= 20
         
     | 
| 
       32 
     | 
    
         
            -
                      options[:iterations] ||= 20
         
     | 
| 
       33 
     | 
    
         
            -
             
     | 
| 
      
 38 
     | 
    
         
            +
                      options = DefaultOptions.merge(options)
         
     | 
| 
       34 
39 
     | 
    
         
             
                      # Create a corpus with the collection
         
     | 
| 
       35 
     | 
    
         
            -
                       
     | 
| 
      
 40 
     | 
    
         
            +
                      sections = collection.sections.collect do |t|
         
     | 
| 
       36 
41 
     | 
    
         
             
                        t.to_s.encode("UTF-8", :invalid => :replace, :undef => :replace, :replace => "?")
         
     | 
| 
       37 
42 
     | 
    
         
             
                      end
         
     | 
| 
       38 
     | 
    
         
            -
                      corpus = Lda::TextCorpus.new( 
     | 
| 
      
 43 
     | 
    
         
            +
                      corpus = Lda::TextCorpus.new(sections)
         
     | 
| 
       39 
44 
     | 
    
         | 
| 
       40 
45 
     | 
    
         
             
                      # Create an Lda object for training
         
     | 
| 
       41 
46 
     | 
    
         
             
                      lda = Lda::Lda.new(corpus)
         
     | 
| 
         @@ -43,15 +48,15 @@ module Treat 
     | 
|
| 
       43 
48 
     | 
    
         
             
                      lda.max_iter = options[:iterations]
         
     | 
| 
       44 
49 
     | 
    
         
             
                      # Run the EM algorithm using random starting points
         
     | 
| 
       45 
50 
     | 
    
         
             
                      silence_streams(STDOUT, STDERR) { lda.em('random') }
         
     | 
| 
       46 
     | 
    
         
            -
             
     | 
| 
      
 51 
     | 
    
         
            +
             
     | 
| 
       47 
52 
     | 
    
         
             
                      # Load the vocabulary.
         
     | 
| 
       48 
53 
     | 
    
         
             
                      if options[:vocabulary]
         
     | 
| 
       49 
54 
     | 
    
         
             
                        lda.load_vocabulary(options[:vocabulary])
         
     | 
| 
       50 
55 
     | 
    
         
             
                      end
         
     | 
| 
       51 
     | 
    
         
            -
             
     | 
| 
       52 
     | 
    
         
            -
                      # Get the topic words and annotate the  
     | 
| 
      
 56 
     | 
    
         
            +
             
     | 
| 
      
 57 
     | 
    
         
            +
                      # Get the topic words and annotate the section.
         
     | 
| 
       53 
58 
     | 
    
         
             
                      topic_words = lda.top_words(options[:words_per_topic])
         
     | 
| 
       54 
     | 
    
         
            -
             
     | 
| 
      
 59 
     | 
    
         
            +
             
     | 
| 
       55 
60 
     | 
    
         
             
                      topic_words.each do |i, words|
         
     | 
| 
       56 
61 
     | 
    
         
             
                        collection.each_word do |word|
         
     | 
| 
       57 
62 
     | 
    
         
             
                          if words.include?(word)
         
     | 
| 
         @@ -62,7 +67,7 @@ module Treat 
     | 
|
| 
       62 
67 
     | 
    
         
             
                          end
         
     | 
| 
       63 
68 
     | 
    
         
             
                        end
         
     | 
| 
       64 
69 
     | 
    
         
             
                      end
         
     | 
| 
       65 
     | 
    
         
            -
             
     | 
| 
      
 70 
     | 
    
         
            +
             
     | 
| 
       66 
71 
     | 
    
         
             
                      topic_words
         
     | 
| 
       67 
72 
     | 
    
         
             
                    end
         
     | 
| 
       68 
73 
     | 
    
         
             
                  end
         
     | 
| 
         @@ -6,9 +6,9 @@ module Treat 
     | 
|
| 
       6 
6 
     | 
    
         
             
                  #
         
     | 
| 
       7 
7 
     | 
    
         
             
                  # Copyright 2005 Mark Watson.  All rights reserved.
         
     | 
| 
       8 
8 
     | 
    
         
             
                  # This software is released under the GPL.
         
     | 
| 
       9 
     | 
    
         
            -
                  #
         
     | 
| 
       10 
     | 
    
         
            -
                  #  
     | 
| 
       11 
     | 
    
         
            -
                  #  
     | 
| 
      
 9 
     | 
    
         
            +
                  # Rewrite for inclusion in Treat by Louis Mullie (2011).
         
     | 
| 
      
 10 
     | 
    
         
            +
                  # 
         
     | 
| 
      
 11 
     | 
    
         
            +
                  # Original project website: http://www.markwatson.com/opensource/
         
     | 
| 
       12 
12 
     | 
    
         
             
                  class Reuters
         
     | 
| 
       13 
13 
     | 
    
         
             
                    # Require the Nokogiri XML parser.
         
     | 
| 
       14 
14 
     | 
    
         
             
                    require 'nokogiri'
         
     | 
| 
         @@ -17,6 +17,8 @@ module Treat 
     | 
|
| 
       17 
17 
     | 
    
         
             
                    @@region = {}
         
     | 
| 
       18 
18 
     | 
    
         
             
                    @@topics = {}
         
     | 
| 
       19 
19 
     | 
    
         
             
                    # Get the topic of the text.
         
     | 
| 
      
 20 
     | 
    
         
            +
                    # 
         
     | 
| 
      
 21 
     | 
    
         
            +
                    # Options: none.
         
     | 
| 
       20 
22 
     | 
    
         
             
                    def self.topics(text, options = {})
         
     | 
| 
       21 
23 
     | 
    
         
             
                      stems = []
         
     | 
| 
       22 
24 
     | 
    
         
             
                      @@reduce = 0
         
     | 
| 
         @@ -33,7 +35,7 @@ module Treat 
     | 
|
| 
       33 
35 
     | 
    
         
             
                      topics = score_words(@@industry, stems)
         
     | 
| 
       34 
36 
     | 
    
         
             
                      topics = topics.merge(score_words(@@region, stems))
         
     | 
| 
       35 
37 
     | 
    
         
             
                      topics = topics.merge(score_words(@@topics, stems))
         
     | 
| 
       36 
     | 
    
         
            -
                      Treat::Feature.new(topics)
         
     | 
| 
      
 38 
     | 
    
         
            +
                      #Treat::Feature.new(topics)
         
     | 
| 
       37 
39 
     | 
    
         
             
                    end
         
     | 
| 
       38 
40 
     | 
    
         
             
                    # Read the topics from the XML files.
         
     | 
| 
       39 
41 
     | 
    
         
             
                    def self.get_topics
         
     | 
    
        data/lib/treat/extractors.rb
    CHANGED
    
    | 
         @@ -6,19 +6,19 @@ module Treat 
     | 
|
| 
       6 
6 
     | 
    
         
             
                module Time
         
     | 
| 
       7 
7 
     | 
    
         
             
                  extend Group
         
     | 
| 
       8 
8 
     | 
    
         
             
                  self.type = :annotator
         
     | 
| 
       9 
     | 
    
         
            -
                  self.targets = [:word, :constituent, :symbol]
         
     | 
| 
      
 9 
     | 
    
         
            +
                  self.targets = [:sentence, :word, :constituent, :symbol]
         
     | 
| 
       10 
10 
     | 
    
         
             
                end
         
     | 
| 
       11 
11 
     | 
    
         
             
                # Extract the topic from a text.
         
     | 
| 
       12 
12 
     | 
    
         
             
                module Topics
         
     | 
| 
       13 
13 
     | 
    
         
             
                  extend Group
         
     | 
| 
       14 
14 
     | 
    
         
             
                  self.type = :annotator
         
     | 
| 
       15 
     | 
    
         
            -
                  self.targets = [:collection, :document, : 
     | 
| 
      
 15 
     | 
    
         
            +
                  self.targets = [:collection, :document, :zone, :sentence]
         
     | 
| 
       16 
16 
     | 
    
         
             
                end
         
     | 
| 
       17 
17 
     | 
    
         
             
                # Extract the topic from a text.
         
     | 
| 
       18 
18 
     | 
    
         
             
                module TopicWords
         
     | 
| 
       19 
19 
     | 
    
         
             
                  extend Group
         
     | 
| 
       20 
20 
     | 
    
         
             
                  self.type = :annotator
         
     | 
| 
       21 
     | 
    
         
            -
                  self.targets = [:collection, :document, : 
     | 
| 
      
 21 
     | 
    
         
            +
                  self.targets = [:collection, :document, :zone, :sentence]
         
     | 
| 
       22 
22 
     | 
    
         
             
                end
         
     | 
| 
       23 
23 
     | 
    
         
             
                # Extract named entities from texts.
         
     | 
| 
       24 
24 
     | 
    
         
             
                module NamedEntity
         
     | 
| 
         @@ -27,15 +27,15 @@ module Treat 
     | 
|
| 
       27 
27 
     | 
    
         
             
                  self.targets = [:entity]
         
     | 
| 
       28 
28 
     | 
    
         
             
                end
         
     | 
| 
       29 
29 
     | 
    
         
             
                # Extract the keywords from a text.
         
     | 
| 
       30 
     | 
    
         
            -
                module  
     | 
| 
      
 30 
     | 
    
         
            +
                module Keywords
         
     | 
| 
       31 
31 
     | 
    
         
             
                  extend Group
         
     | 
| 
       32 
     | 
    
         
            -
                  self.type = : 
     | 
| 
       33 
     | 
    
         
            -
                  self.targets = [:collection, :document, : 
     | 
| 
      
 32 
     | 
    
         
            +
                  self.type = :annotator
         
     | 
| 
      
 33 
     | 
    
         
            +
                  self.targets = [:collection, :document, :zone, :sentence]
         
     | 
| 
       34 
34 
     | 
    
         
             
                end
         
     | 
| 
       35 
35 
     | 
    
         
             
                # This module should be moved out of here ASAP.
         
     | 
| 
       36 
36 
     | 
    
         
             
                module Statistics
         
     | 
| 
       37 
37 
     | 
    
         
             
                  extend Group
         
     | 
| 
       38 
     | 
    
         
            -
                  self.type = : 
     | 
| 
      
 38 
     | 
    
         
            +
                  self.type = :annotator
         
     | 
| 
       39 
39 
     | 
    
         
             
                  self.targets = [:entity]
         
     | 
| 
       40 
40 
     | 
    
         
             
                  self.default = :none
         
     | 
| 
       41 
41 
     | 
    
         
             
                end
         
     | 
| 
         @@ -0,0 +1,32 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Treat
         
     | 
| 
      
 2 
     | 
    
         
            +
              module Formatters
         
     | 
| 
      
 3 
     | 
    
         
            +
                module Readers
         
     | 
| 
      
 4 
     | 
    
         
            +
                  class Abw
         
     | 
| 
      
 5 
     | 
    
         
            +
                    require 'rexml/document'
         
     | 
| 
      
 6 
     | 
    
         
            +
                    require 'rexml/streamlistener'
         
     | 
| 
      
 7 
     | 
    
         
            +
                    def self.read(document, options = {})
         
     | 
| 
      
 8 
     | 
    
         
            +
                      xml_h = AbiWordXmlHandler.new(
         
     | 
| 
      
 9 
     | 
    
         
            +
                      REXML::Document.parse_stream((IO.read(document.file)), xml_h))
         
     | 
| 
      
 10 
     | 
    
         
            +
                      document << xml_h.plain_text
         
     | 
| 
      
 11 
     | 
    
         
            +
                      document
         
     | 
| 
      
 12 
     | 
    
         
            +
                    end
         
     | 
| 
      
 13 
     | 
    
         
            +
                    class AbiWordXmlHandler
         
     | 
| 
      
 14 
     | 
    
         
            +
                      include REXML::StreamListener
         
     | 
| 
      
 15 
     | 
    
         
            +
                      attr_reader :plain_text
         
     | 
| 
      
 16 
     | 
    
         
            +
                      def initialize
         
     | 
| 
      
 17 
     | 
    
         
            +
                        @plain_text = ""
         
     | 
| 
      
 18 
     | 
    
         
            +
                      end
         
     | 
| 
      
 19 
     | 
    
         
            +
                      def text s
         
     | 
| 
      
 20 
     | 
    
         
            +
                        begin
         
     | 
| 
      
 21 
     | 
    
         
            +
                          s = s.strip
         
     | 
| 
      
 22 
     | 
    
         
            +
                          if s.length > 0
         
     | 
| 
      
 23 
     | 
    
         
            +
                            @plain_text << s
         
     | 
| 
      
 24 
     | 
    
         
            +
                            @plain_text << "\n"
         
     | 
| 
      
 25 
     | 
    
         
            +
                          end
         
     | 
| 
      
 26 
     | 
    
         
            +
                        end if s != 'AbiWord' && s != 'application/x-abiword'
         
     | 
| 
      
 27 
     | 
    
         
            +
                      end
         
     | 
| 
      
 28 
     | 
    
         
            +
                    end
         
     | 
| 
      
 29 
     | 
    
         
            +
                  end
         
     | 
| 
      
 30 
     | 
    
         
            +
                end
         
     | 
| 
      
 31 
     | 
    
         
            +
              end
         
     | 
| 
      
 32 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -6,26 +6,28 @@ module Treat 
     | 
|
| 
       6 
6 
     | 
    
         
             
                  # the appropriate reader based on the file
         
     | 
| 
       7 
7 
     | 
    
         
             
                  # extension of the supplied document.
         
     | 
| 
       8 
8 
     | 
    
         
             
                  class Autoselect
         
     | 
| 
       9 
     | 
    
         
            -
                    # A list of image extensions that should be routed
         
     | 
| 
       10 
     | 
    
         
            -
                    # to the Ocropus OCR engine.
         
     | 
| 
      
 9 
     | 
    
         
            +
                    # A list of image extensions that should be routed to OCR.
         
     | 
| 
       11 
10 
     | 
    
         
             
                    ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
         
     | 
| 
      
 11 
     | 
    
         
            +
                    # Default options.
         
     | 
| 
      
 12 
     | 
    
         
            +
                    DefaultOptions = {:ocr => :ocropus}
         
     | 
| 
       12 
13 
     | 
    
         
             
                    # Select the appropriate reader based on the format
         
     | 
| 
       13 
14 
     | 
    
         
             
                    # of the filename in document.
         
     | 
| 
       14 
15 
     | 
    
         
             
                    # 
         
     | 
| 
       15 
16 
     | 
    
         
             
                    # Options:
         
     | 
| 
       16 
     | 
    
         
            -
                    # 
     | 
| 
       17 
     | 
    
         
            -
                     
     | 
| 
      
 17 
     | 
    
         
            +
                    #
         
     | 
| 
      
 18 
     | 
    
         
            +
                    # - :ocr => :ocropus or :gocr (the OCR engine to use).
         
     | 
| 
      
 19 
     | 
    
         
            +
                    def self.read(document, options)
         
     | 
| 
      
 20 
     | 
    
         
            +
                      options = DefaultOptions.merge(options)
         
     | 
| 
       18 
21 
     | 
    
         
             
                      ext = document.file.split('.')[-1]
         
     | 
| 
       19 
     | 
    
         
            -
                       
     | 
| 
       20 
     | 
    
         
            -
             
     | 
| 
       21 
     | 
    
         
            -
                       
     | 
| 
       22 
     | 
    
         
            -
                        reader = ext
         
     | 
| 
       23 
     | 
    
         
            -
                      end
         
     | 
| 
      
 22 
     | 
    
         
            +
                      reader = ImageExtensions.include?(ext) ? 'ocropus' : ext
         
     | 
| 
      
 23 
     | 
    
         
            +
                      reader = 'html' if reader == 'htm'
         
     | 
| 
      
 24 
     | 
    
         
            +
                      reader = 'yaml' if reader == 'yml'
         
     | 
| 
       24 
25 
     | 
    
         
             
                      begin
         
     | 
| 
       25 
26 
     | 
    
         
             
                        r = Treat::Formatters::Readers.const_get(cc(reader))
         
     | 
| 
       26 
     | 
    
         
            -
                      rescue NameError
         
     | 
| 
      
 27 
     | 
    
         
            +
                      rescue NameError => e
         
     | 
| 
      
 28 
     | 
    
         
            +
                        puts e.message
         
     | 
| 
       27 
29 
     | 
    
         
             
                        raise Treat::Exception,
         
     | 
| 
       28 
     | 
    
         
            -
                        "Cannot find a  
     | 
| 
      
 30 
     | 
    
         
            +
                        "Cannot find a reader for format: '#{ext}'."
         
     | 
| 
       29 
31 
     | 
    
         
             
                      end
         
     | 
| 
       30 
32 
     | 
    
         
             
                      document = r.read(document, options)
         
     | 
| 
       31 
33 
     | 
    
         
             
                    end
         
     | 
| 
         @@ -1,11 +1,31 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Treat
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Formatters
         
     | 
| 
       3 
3 
     | 
    
         
             
                module Readers
         
     | 
| 
      
 4 
     | 
    
         
            +
                  # A temporary HTML reader; simply strips the
         
     | 
| 
      
 5 
     | 
    
         
            +
                  # document of all of its markup.
         
     | 
| 
       4 
6 
     | 
    
         
             
                  class HTML
         
     | 
| 
      
 7 
     | 
    
         
            +
                    # Require Hpricot.
         
     | 
| 
      
 8 
     | 
    
         
            +
                    silence_warnings { require 'hpricot' }
         
     | 
| 
      
 9 
     | 
    
         
            +
                    # By default, clean the HTML markup without backing it up.
         
     | 
| 
      
 10 
     | 
    
         
            +
                    DefaultOptions = { clean: true, backup: false }
         
     | 
| 
      
 11 
     | 
    
         
            +
                    # Read the HTML document and strip it of its markup.
         
     | 
| 
      
 12 
     | 
    
         
            +
                    #
         
     | 
| 
      
 13 
     | 
    
         
            +
                    # Options:
         
     | 
| 
      
 14 
     | 
    
         
            +
                    #
         
     | 
| 
      
 15 
     | 
    
         
            +
                    # - (Boolean) :clean => whether to strip HTML markup.
         
     | 
| 
      
 16 
     | 
    
         
            +
                    # - (Boolean) :backup => whether to backup the HTML 
         
     | 
| 
      
 17 
     | 
    
         
            +
                    #   markup while cleaning.
         
     | 
| 
       5 
18 
     | 
    
         
             
                    def self.read(document, options = {})
         
     | 
| 
      
 19 
     | 
    
         
            +
                      options = DefaultOptions.merge(options)
         
     | 
| 
       6 
20 
     | 
    
         
             
                      f = File.read(document.file)
         
     | 
| 
       7 
21 
     | 
    
         
             
                      document << Treat::Entities::Entity.from_string(f)
         
     | 
| 
       8 
     | 
    
         
            -
                       
     | 
| 
      
 22 
     | 
    
         
            +
                      if options[:clean]
         
     | 
| 
      
 23 
     | 
    
         
            +
                        document.each do |section|
         
     | 
| 
      
 24 
     | 
    
         
            +
                          section.set :html_value, section.value if options[:backup]
         
     | 
| 
      
 25 
     | 
    
         
            +
                          section.value = Hpricot(section.value).inner_text
         
     | 
| 
      
 26 
     | 
    
         
            +
                        end
         
     | 
| 
      
 27 
     | 
    
         
            +
                      end
         
     | 
| 
      
 28 
     | 
    
         
            +
                      document
         
     | 
| 
       9 
29 
     | 
    
         
             
                    end
         
     | 
| 
       10 
30 
     | 
    
         
             
                  end
         
     | 
| 
       11 
31 
     | 
    
         
             
                end
         
     | 
| 
         @@ -15,11 +15,11 @@ module Treat 
     | 
|
| 
       15 
15 
     | 
    
         
             
                  # DFKI and U. Kaiserslautern, Germany.
         
     | 
| 
       16 
16 
     | 
    
         
             
                  class Ocropus
         
     | 
| 
       17 
17 
     | 
    
         
             
                    #  Read a file using the Google Ocropus reader.
         
     | 
| 
      
 18 
     | 
    
         
            +
                    # 
         
     | 
| 
      
 19 
     | 
    
         
            +
                    # Options: none.
         
     | 
| 
       18 
20 
     | 
    
         
             
                    def self.read(document, options = {})
         
     | 
| 
       19 
21 
     | 
    
         
             
                      create_temp_file(:txt) do |tmp|
         
     | 
| 
       20 
     | 
    
         
            -
                         
     | 
| 
       21 
     | 
    
         
            -
                          `ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
         
     | 
| 
       22 
     | 
    
         
            -
                        end
         
     | 
| 
      
 22 
     | 
    
         
            +
                        `ocropus page #{document.file} > #{tmp} -STDIO 2>/dev/null`
         
     | 
| 
       23 
23 
     | 
    
         
             
                        f = File.read(tmp)
         
     | 
| 
       24 
24 
     | 
    
         
             
                        document << Treat::Entities::Entity.from_string(f)
         
     | 
| 
       25 
25 
     | 
    
         
             
                      end
         
     | 
| 
         @@ -0,0 +1,41 @@ 
     | 
|
| 
      
 1 
     | 
    
         
            +
            module Treat
         
     | 
| 
      
 2 
     | 
    
         
            +
              module Formatters
         
     | 
| 
      
 3 
     | 
    
         
            +
                module Readers
         
     | 
| 
      
 4 
     | 
    
         
            +
                  class Odt
         
     | 
| 
      
 5 
     | 
    
         
            +
                    # Parse the document's XML and append its extracted plain text.
         
     | 
| 
      
 6 
     | 
    
         
            +
                    def self.read(document, options = {})
         
     | 
| 
      
 7 
     | 
    
         
            +
                      f = File.read(document.file)
         
     | 
| 
      
 8 
     | 
    
         
            +
                      f = f.force_encoding("UTF-8")
         
     | 
| 
      
 9 
     | 
    
         
            +
                      xml_h = OOXmlHandler.new(
         
     | 
| 
      
 10 
     | 
    
         
            +
                        REXML::Document.parse_stream(f, xml_h)
         
     | 
| 
      
 11 
     | 
    
         
            +
                      )
         
     | 
| 
      
 12 
     | 
    
         
            +
                      document << xml_h.plain_text
         
     | 
| 
      
 13 
     | 
    
         
            +
                      document
         
     | 
| 
      
 14 
     | 
    
         
            +
                    end
         
     | 
| 
      
 15 
     | 
    
         
            +
             
     | 
| 
      
 16 
     | 
    
         
            +
                    class OOXmlHandler
         
     | 
| 
      
 17 
     | 
    
         
            +
                      require 'rexml/document'
         
     | 
| 
      
 18 
     | 
    
         
            +
                      require 'rexml/streamlistener'
         
     | 
| 
      
 19 
     | 
    
         
            +
                      include REXML::StreamListener
         
     | 
| 
      
 20 
     | 
    
         
            +
                      attr_reader :plain_text
         
     | 
| 
      
 21 
     | 
    
         
            +
                      def initialize
         
     | 
| 
      
 22 
     | 
    
         
            +
                        @plain_text = ""
         
     | 
| 
      
 23 
     | 
    
         
            +
                      end
         
     | 
| 
      
 24 
     | 
    
         
            +
                      def tag_start(name, attrs)
         
     | 
| 
      
 25 
     | 
    
         
            +
                        @last_name = name
         
     | 
| 
      
 26 
     | 
    
         
            +
                      end
         
     | 
| 
      
 27 
     | 
    
         
            +
                      def text(s)
         
     | 
| 
      
 28 
     | 
    
         
            +
                        if @last_name.index('text')
         
     | 
| 
      
 29 
     | 
    
         
            +
                          s = s.strip
         
     | 
| 
      
 30 
     | 
    
         
            +
                          if s.length > 0
         
     | 
| 
      
 31 
     | 
    
         
            +
                            @plain_text << s
         
     | 
| 
      
 32 
     | 
    
         
            +
                            @plain_text << "\n"
         
     | 
| 
      
 33 
     | 
    
         
            +
                          end
         
     | 
| 
      
 34 
     | 
    
         
            +
                        end
         
     | 
| 
      
 35 
     | 
    
         
            +
                      end
         
     | 
| 
      
 36 
     | 
    
         
            +
                    end
         
     | 
| 
      
 37 
     | 
    
         
            +
                  end
         
     | 
| 
      
 38 
     | 
    
         
            +
                  
         
     | 
| 
      
 39 
     | 
    
         
            +
                end
         
     | 
| 
      
 40 
     | 
    
         
            +
              end
         
     | 
| 
      
 41 
     | 
    
         
            +
            end
         
     | 
| 
         @@ -1,9 +1,12 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Treat
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Formatters
         
     | 
| 
       3 
3 
     | 
    
         
             
                module Readers
         
     | 
| 
      
 4 
     | 
    
         
            +
                  # A wrapper for the Poppler pdftotext utility, which
         
     | 
| 
      
 5 
     | 
    
         
            +
                  # extracts the text from a PDF file.
         
     | 
| 
       4 
6 
     | 
    
         
             
                  class PDF
         
     | 
| 
       5 
     | 
    
         
            -
                     
     | 
| 
       6 
     | 
    
         
            -
                    #  
     | 
| 
      
 7 
     | 
    
         
            +
                    # Read a PDF file using the Poppler pdftotext utility.
         
     | 
| 
      
 8 
     | 
    
         
            +
                    # 
         
     | 
| 
      
 9 
     | 
    
         
            +
                    # Options: none.
         
     | 
| 
       7 
10 
     | 
    
         
             
                    def self.read(document, options = {})
         
     | 
| 
       8 
11 
     | 
    
         
             
                      create_temp_file(:txt) do |tmp|
         
     | 
| 
       9 
12 
     | 
    
         
             
                        `pdftotext #{document.file} #{tmp} `.strip
         
     | 
| 
         @@ -4,6 +4,8 @@ module Treat 
     | 
|
| 
       4 
4 
     | 
    
         
             
                  # This class simply reads a plain text file.
         
     | 
| 
       5 
5 
     | 
    
         
             
                  class Txt
         
     | 
| 
       6 
6 
     | 
    
         
             
                    # Build an entity from a string in plain text format.
         
     | 
| 
      
 7 
     | 
    
         
            +
                    # 
         
     | 
| 
      
 8 
     | 
    
         
            +
                    # Options: none.
         
     | 
| 
       7 
9 
     | 
    
         
             
                    def self.read(document, options = {})
         
     | 
| 
       8 
10 
     | 
    
         
             
                      f = File.read(document.file)
         
     | 
| 
       9 
11 
     | 
    
         
             
                      document << Treat::Entities::Entity.from_string(f)
         
     | 
| 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Treat
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Formatters
         
     | 
| 
       3 
3 
     | 
    
         
             
                module Serializers
         
     | 
| 
       4 
     | 
    
         
            -
                  # This class converts an entity to XML format.
         
     | 
| 
      
 4 
     | 
    
         
            +
                  # This class converts an entity to a storable XML format.
         
     | 
| 
       5 
5 
     | 
    
         
             
                  class XML
         
     | 
| 
       6 
6 
     | 
    
         
             
                    # Require the Nokogiri XML parser.
         
     | 
| 
       7 
7 
     | 
    
         
             
                    require 'nokogiri'
         
     | 
| 
         @@ -9,7 +9,8 @@ module Treat 
     | 
|
| 
       9 
9 
     | 
    
         
             
                    def self.serialize(entity, options = {})
         
     | 
| 
       10 
10 
     | 
    
         
             
                      options = {:indent => 0} if options[:indent].nil?
         
     | 
| 
       11 
11 
     | 
    
         
             
                      if options[:indent] == 0
         
     | 
| 
       12 
     | 
    
         
            -
                         
     | 
| 
      
 12 
     | 
    
         
            +
                        enc = entity.encoding(:r_chardet19).to_s.gsub('_', '-').upcase
         
     | 
| 
      
 13 
     | 
    
         
            +
                        string = "<?xml version=\"1.0\" encoding=\"#{enc}\" standalone=\"no\" ?>"
         
     | 
| 
       13 
14 
     | 
    
         
             
                      else
         
     | 
| 
       14 
15 
     | 
    
         
             
                        string = ''
         
     | 
| 
       15 
16 
     | 
    
         
             
                      end
         
     | 
| 
         @@ -1,7 +1,13 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Treat
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Formatters
         
     | 
| 
       3 
3 
     | 
    
         
             
                module Unserializers
         
     | 
| 
      
 4 
     | 
    
         
            +
                  # This class doesn't perform any unserializing;
         
     | 
| 
      
 5 
     | 
    
         
            +
                  # it simply routes the document to an unserializer
         
     | 
| 
      
 6 
     | 
    
         
            +
                  # based on the file extension of the document.
         
     | 
| 
       4 
7 
     | 
    
         
             
                  class Autoselect
         
     | 
| 
      
 8 
     | 
    
         
            +
                    # Unserialize any supported file format.
         
     | 
| 
      
 9 
     | 
    
         
            +
                    # 
         
     | 
| 
      
 10 
     | 
    
         
            +
                    # Options: none.
         
     | 
| 
       5 
11 
     | 
    
         
             
                    def self.unserialize(document, options = {})
         
     | 
| 
       6 
12 
     | 
    
         
             
                      ext = document.file.split('.')[-1]
         
     | 
| 
       7 
13 
     | 
    
         
             
                      if ext == 'yaml' || ext == 'yml'
         
     | 
| 
         @@ -9,7 +15,7 @@ module Treat 
     | 
|
| 
       9 
15 
     | 
    
         
             
                      elsif ext == 'xml'
         
     | 
| 
       10 
16 
     | 
    
         
             
                        document.unserialize(:xml)
         
     | 
| 
       11 
17 
     | 
    
         
             
                      else
         
     | 
| 
       12 
     | 
    
         
            -
                        raise "File #{document.file} was not recognized"+
         
     | 
| 
      
 18 
     | 
    
         
            +
                        raise "File #{document.file} was not recognized "+
         
     | 
| 
       13 
19 
     | 
    
         
             
                        "as a supported serialized format."
         
     | 
| 
       14 
20 
     | 
    
         
             
                      end
         
     | 
| 
       15 
21 
     | 
    
         
             
                    end
         
     | 
| 
         @@ -1,9 +1,13 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Treat
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Formatters
         
     | 
| 
       3 
3 
     | 
    
         
             
                module Unserializers
         
     | 
| 
      
 4 
     | 
    
         
            +
                  # Recreates the entity tree corresponding to 
         
     | 
| 
      
 5 
     | 
    
         
            +
                  # a serialized XML file.
         
     | 
| 
       4 
6 
     | 
    
         
             
                  class XML
         
     | 
| 
       5 
7 
     | 
    
         
             
                    require 'nokogiri'
         
     | 
| 
       6 
     | 
    
         
            -
                    
         
     | 
| 
      
 8 
     | 
    
         
            +
                    # Unserialize an entity stored in XML format.
         
     | 
| 
      
 9 
     | 
    
         
            +
                    # 
         
     | 
| 
      
 10 
     | 
    
         
            +
                    # Options: none.
         
     | 
| 
       7 
11 
     | 
    
         
             
                    def self.unserialize(document, options = {})
         
     | 
| 
       8 
12 
     | 
    
         
             
                      # Read in the XML file.
         
     | 
| 
       9 
13 
     | 
    
         
             
                      xml = File.read(document.file)
         
     | 
| 
         @@ -59,6 +63,7 @@ module Treat 
     | 
|
| 
       59 
63 
     | 
    
         
             
                          current_value = xml_reader.value.strip
         
     | 
| 
       60 
64 
     | 
    
         
             
                          if current_value && current_value != ''
         
     | 
| 
       61 
65 
     | 
    
         
             
                            current_element.value = current_value
         
     | 
| 
      
 66 
     | 
    
         
            +
                            current_element.register_token(current_element)
         
     | 
| 
       62 
67 
     | 
    
         
             
                          end
         
     | 
| 
       63 
68 
     | 
    
         
             
                        end
         
     | 
| 
       64 
69 
     | 
    
         | 
| 
         @@ -1,10 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            module Treat
         
     | 
| 
       2 
2 
     | 
    
         
             
              module Formatters
         
     | 
| 
       3 
3 
     | 
    
         
             
                module Unserializers
         
     | 
| 
      
 4 
     | 
    
         
            +
                  # This class is a wrapper for the Psych YAML
         
     | 
| 
      
 5 
     | 
    
         
            +
                  # parser; it unserializes YAML files.
         
     | 
| 
       4 
6 
     | 
    
         
             
                  class YAML
         
     | 
| 
       5 
7 
     | 
    
         
             
                    # Require the Psych YAML parser.
         
     | 
| 
       6 
8 
     | 
    
         
             
                    require 'psych'
         
     | 
| 
       7 
     | 
    
         
            -
                    # Unserialize a YAML file 
     | 
| 
      
 9 
     | 
    
         
            +
                    # Unserialize a YAML file.
         
     | 
| 
      
 10 
     | 
    
         
            +
                    # 
         
     | 
| 
      
 11 
     | 
    
         
            +
                    # Options: none.
         
     | 
| 
       8 
12 
     | 
    
         
             
                    def self.unserialize(document, options = {})
         
     | 
| 
       9 
13 
     | 
    
         
             
                      document << ::Psych.load(File.read(document.file))
         
     | 
| 
       10 
14 
     | 
    
         
             
                      document
         
     |