RubyGems - treat - Versions diffs - 0.1.2 → 0.1.3 - Mend

treat 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (100) hide show

data/LICENSE +7 -8
data/TODO +16 -13
data/examples/keywords.rb +89 -1
data/lib/treat/buildable.rb +1 -8
data/lib/treat/categories.rb +3 -4
data/lib/treat/category.rb +1 -1
data/lib/treat/delegatable.rb +1 -1
data/lib/treat/detectors/encoding/native.rb +5 -0
data/lib/treat/detectors/encoding/r_chardet19.rb +2 -3
data/lib/treat/detectors/language/language_detector.rb +4 -0
data/lib/treat/detectors/language/what_language.rb +4 -4
data/lib/treat/detectors.rb +1 -1
data/lib/treat/entities/entity.rb +5 -3
data/lib/treat/entities/tokens.rb +14 -5
data/lib/treat/entities/zones.rb +4 -0
data/lib/treat/entities.rb +7 -5
data/lib/treat/extractors/keywords/topics_frequency.rb +40 -0
data/lib/treat/extractors/statistics/{frequency.rb → frequency_in.rb} +5 -4
data/lib/treat/extractors/statistics/frequency_of.rb +3 -5
data/lib/treat/extractors/statistics/{position_in.rb → position_in_parent.rb} +4 -3
data/lib/treat/extractors/statistics/tf_idf.rb +36 -0
data/lib/treat/extractors/statistics/transition_matrix.rb +25 -25
data/lib/treat/extractors/statistics/transition_probability.rb +8 -4
data/lib/treat/extractors/time/chronic.rb +8 -0
data/lib/treat/extractors/time/native.rb +6 -0
data/lib/treat/extractors/time/nickel.rb +31 -23
data/lib/treat/extractors/topic_words/lda.rb +21 -16
data/lib/treat/extractors/topics/reuters.rb +6 -4
data/lib/treat/extractors.rb +7 -7
data/lib/treat/formatters/readers/abw.rb +32 -0
data/lib/treat/formatters/readers/autoselect.rb +13 -11
data/lib/treat/formatters/readers/doc.rb +13 -0
data/lib/treat/formatters/readers/gocr.rb +2 -0
data/lib/treat/formatters/readers/html.rb +21 -1
data/lib/treat/formatters/readers/ocropus.rb +3 -3
data/lib/treat/formatters/readers/odt.rb +41 -0
data/lib/treat/formatters/readers/pdf.rb +5 -2
data/lib/treat/formatters/readers/txt.rb +2 -0
data/lib/treat/formatters/serializers/xml.rb +3 -2
data/lib/treat/formatters/serializers/yaml.rb +2 -0
data/lib/treat/formatters/unserializers/autoselect.rb +7 -1
data/lib/treat/formatters/unserializers/xml.rb +6 -1
data/lib/treat/formatters/unserializers/yaml.rb +5 -1
data/lib/treat/formatters/visualizers/dot.rb +35 -37
data/lib/treat/formatters/visualizers/html.rb +1 -0
data/lib/treat/formatters/visualizers/inspect.rb +4 -0
data/lib/treat/formatters/visualizers/short_value.rb +18 -3
data/lib/treat/formatters/visualizers/standoff.rb +11 -6
data/lib/treat/formatters/visualizers/tree.rb +5 -1
data/lib/treat/formatters/visualizers/txt.rb +6 -1
data/lib/treat/formatters.rb +1 -1
data/lib/treat/group.rb +4 -3
data/lib/treat/inflectors/cardinal_words/linguistics.rb +14 -17
data/lib/treat/inflectors/conjugations/linguistics.rb +16 -3
data/lib/treat/inflectors/declensions/linguistics.rb +17 -6
data/lib/treat/inflectors/ordinal_words/linguistics.rb +9 -10
data/lib/treat/inflectors/stem/porter.rb +6 -2
data/lib/treat/inflectors/stem/porter_c.rb +4 -1
data/lib/treat/inflectors/stem/uea.rb +4 -4
data/lib/treat/languages/english/tags.rb +16 -0
data/lib/treat/languages/english.rb +4 -1
data/lib/treat/lexicalizers/category/from_tag.rb +4 -4
data/lib/treat/lexicalizers/linkages/naive.rb +3 -3
data/lib/treat/lexicalizers/tag/brill.rb +3 -11
data/lib/treat/lexicalizers/tag/lingua.rb +4 -6
data/lib/treat/lexicalizers.rb +0 -2
data/lib/treat/processors/chunkers/txt.rb +4 -4
data/lib/treat/processors/parsers/enju.rb +3 -17
data/lib/treat/processors/parsers/stanford.rb +4 -0
data/lib/treat/processors/segmenters/punkt.rb +1 -0
data/lib/treat/processors/segmenters/stanford.rb +4 -0
data/lib/treat/processors/segmenters/tactful.rb +4 -1
data/lib/treat/processors/tokenizers/punkt.rb +1 -2
data/lib/treat/processors/tokenizers/stanford.rb +4 -0
data/lib/treat/processors/tokenizers/tactful.rb +1 -1
data/lib/treat/processors.rb +4 -4
data/lib/treat/proxies.rb +18 -11
data/lib/treat/registrable.rb +12 -5
data/lib/treat/sugar.rb +8 -3
data/lib/treat/tree.rb +10 -3
data/lib/treat.rb +55 -55
data/test/tc_entity.rb +7 -7
data/test/tc_extractors.rb +6 -4
data/test/tc_formatters.rb +0 -4
data/test/tests.rb +2 -0
data/test/texts.rb +4 -4
metadata +48 -56
data/examples/texts/bugged_out.txt +0 -26
data/examples/texts/half_cocked_basel.txt +0 -16
data/examples/texts/hedge_funds.txt +0 -24
data/examples/texts/hose_and_dry.txt +0 -19
data/examples/texts/hungarys_troubles.txt +0 -46
data/examples/texts/indias_slowdown.txt +0 -15
data/examples/texts/merkozy_rides_again.txt +0 -24
data/examples/texts/prada_is_not_walmart.txt +0 -9
data/examples/texts/republican_nomination.txt +0 -26
data/examples/texts/to_infinity_and_beyond.txt +0 -15
data/lib/treat/entities/text.rb +0 -7
data/lib/treat/extractors/key_sentences/topics_frequency.rb +0 -49
data/lib/treat/formatters/cleaners/html.rb +0 -17

data/lib/treat/lexicalizers.rb CHANGED Viewed

@@ -29,7 +29,6 @@ module Treat
       extend Group
       self.type = :annotator
       self.targets = [:word, :number]
       def self.synonyms(entity, synsets)
         synsets.collect { |ss| ss.synonyms }.flatten - [entity.value]
       end
@@ -42,7 +41,6 @@ module Treat
       def self.hypernyms(entity, synsets)
         synsets.collect { |ss| ss.hypernyms }.flatten
       end
     end
     extend Treat::Category
   end

data/lib/treat/processors/chunkers/txt.rb CHANGED Viewed

@@ -5,18 +5,18 @@ module Treat
       # zones based on a very naive analysis of the
       # file.
       class Txt
-        # Return an array of Zone objects found in the text.
+        # Split a document into Zone objects.
         def self.chunk(text, options = {})
           zones = text.to_s.split("\n")
           zones.each do |zone|
             next if zone.strip == ''
             if false # fix
-              text << Entities::List.new(zone)
+              text << Treat::Entities::List.new(zone)
             end
             if zone.length < 60
-              text << Entities::Title.new(zone)
+              text << Treat::Entities::Title.new(zone)
             else
-              text << Entities::Paragraph.new(zone)
+              text << Treat::Entities::Paragraph.new(zone)
             end
           end
           text

data/lib/treat/processors/parsers/enju.rb CHANGED Viewed

@@ -20,21 +20,6 @@ module Treat
         @@i = 0
         # Require the Nokogiri XML parser.
         require 'nokogiri'
-        # Maps Enju categories to Treat categories.
-        CategoryMap = {
-          'ADJ' => :adjective,
-          'ADV' => :adverb,
-          'CONJ' => :conjunction,
-          'COOD' => :conjunction,
-          'C' => :complementizer,
-          'D' => :determiner,
-          'N' => :noun,
-          'P' => :preposition,
-          'PN' => :punctuation,
-          'SC' => :conjunction,
-          'V' => :verb,
-          'PRT' => :particle
-        }
         # Return the process running Enju.
         def self.proc
           if @@parsers.size < @@options[:processes]
@@ -55,7 +40,8 @@ module Treat
             text = entity.to_s + '.'
           else
             remove_last = false
-            text = entity.to_s.gsub('.', '') + '.' # Fix
+            text = entity.to_s.gsub('.', '')
+            text += '.' unless ['!', '?'].include?(text[-1])
           end
           stdin.puts(text + "\n")
           parsed = build(stdout.gets, remove_last)
@@ -114,7 +100,7 @@ module Treat
                         new_attributes[:saturated] = (value[-1] == 'P')
                         value = value[0..-2]
                       end
-                      cat = CategoryMap[value]
+                      cat = Treat::Languages::English::EnjuCatToCategory[value]
                       new_attributes[:cat] = cat
                     else
                       new_attributes[:enju_cat] = value

data/lib/treat/processors/parsers/stanford.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 module Treat
   module Processors
     module Parsers
+      # A wrapper class for the Stanford parser.
       class Stanford
         # Require the Ruby-Java bridge.
         silence_warnings { require 'rjb' }
@@ -13,6 +14,7 @@ module Treat
         Rjb::load(jars[0], ['-Xms256M', '-Xmx512M'])
         LexicalizedParser = ::Rjb::import('edu.stanford.nlp.parser.lexparser.LexicalizedParser')
         @@parsers = {}
+        # Parse the entity using the Stanford parser.
         def self.parse(entity, options = {})
           lang = Treat::Languages.describe(entity.language).to_s.upcase
           pcfg = "#{Treat.bin}/stanford-parser*/grammar/#{lang.upcase}PCFG.ser.gz"
@@ -26,6 +28,8 @@ module Treat
           recurse(parse, entity)
           entity
         end
+        # Helper method which recurses the tree supplied by
+        # the Stanford parser.
         def self.recurse(java_node, ruby_node)
           # Leaf
           if java_node.num_children == 0

data/lib/treat/processors/segmenters/punkt.rb CHANGED Viewed

@@ -21,6 +21,7 @@ module Treat
         # Segment a text using the Punkt segmenter gem.
         #
         # Options:
+        #
         #   :training_text => (String) Text to train the segmenter on.
         def self.segment(entity, options = {})
           lang = entity.language

data/lib/treat/processors/segmenters/stanford.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 module Treat
   module Processors
     module Segmenters
+      # A wrapper for the sentence splitter supplied by
+      # the Stanford parser.
       class Stanford
         # Require the Ruby-Java bridge.
         silence_warnings do
@@ -16,6 +18,8 @@ module Treat
           ::Rjb::import('edu.stanford.nlp.process.DocumentPreprocessor')
           StringReader = ::Rjb::import('java.io.StringReader')
         end
+        # Segment sentences using the sentence splitter supplied by
+        # the Stanford parser.
         def self.segment(entity, options = {})
           sr = StringReader.new(entity.to_s)
           sit = DocumentPreprocessor.new(sr).iterator

data/lib/treat/processors/segmenters/tactful.rb CHANGED Viewed

@@ -8,7 +8,10 @@ module Treat
       # based on Splitta, but has support for ‘?’ and ‘!’
       # as well as primitive handling of XHTML markup.
       #
-      # Project website:
+      # Project website: https://github.com/SlyShy/Tackful-Tokenizer
+      # Original paper: Dan Gillick. 2009. Sentence Boundary Detection
+      # and the Problem with the U.S. University of California, Berkeley.
+      # http://dgillick.com/resource/sbd_naacl_2009.pdf
       class Tactful
         # Require the 'tactful_tokenizer' gem.
         silence_warnings { require 'tactful_tokenizer' }

data/lib/treat/processors/tokenizers/punkt.rb CHANGED Viewed

@@ -26,12 +26,11 @@ module Treat
         ReWordTokenizer = /#{ReMultiCharPunct}|(?=#{ReWordStart})\S+?(?=\s|$|#{ReNonWordChars}|#{ReMultiCharPunct}|,(?=$|\s|#{ReNonWordChars}|#{ReMultiCharPunct}))|\S/
         RePeriodContext = /\S*#{ReSentEndChars}(?=(?<after_tok>#{ReNonWordChars}|\s+(?<next_tok>\S+)))/
         # Tokenize the text using the algorithm lifted from
-        # the Punkt tokenizer.
+        # the Punkt tokenizer gem.
         #
         # Options: none.
         def self.tokenize(entity, options = {})
           entity.to_s.scan(ReWordTokenizer).each do |token|
-            puts token
             entity << Treat::Entities::Entity.from_string(token)
           end
           entity

data/lib/treat/processors/tokenizers/stanford.rb CHANGED Viewed

@@ -1,6 +1,8 @@
 module Treat
   module Processors
     module Tokenizers
+      # A wrapper for the Stanford parser's Penn-Treebank
+      # style tokenizer.
       class Stanford
         # Require the Ruby-Java bridge.
         silence_warnings do
@@ -18,6 +20,8 @@ module Treat
           CoreLabelTokenFactory = ::Rjb::import('edu.stanford.nlp.process.CoreLabelTokenFactory')
           StringReader = ::Rjb::import('java.io.StringReader')
         end
+        # Tokenize the entity using a Penn-Treebank style tokenizer
+        # included with the Stanford Parser.
         def self.tokenize(entity, options = {})
           ptbt = PTBTokenizer.new(
             StringReader.new(entity.to_s),

data/lib/treat/processors/tokenizers/tactful.rb CHANGED Viewed

@@ -41,7 +41,7 @@ module Treat
           [/([Cc])annot/, '\1an not']
         ]
         # Tokenize the entity using a rule-based algorithm
-        # which has been lifted from the 'tactful-tokenizer'
+        # that has been lifted from the 'tactful-tokenizer'
         # gem.
         def self.tokenize(entity, options = {})
           s = entity.to_s

data/lib/treat/processors.rb CHANGED Viewed

@@ -17,19 +17,19 @@ module Treat
     module Chunkers
       extend Group
       self.type = :transformer
-      self.targets = [:document, :text]
+      self.targets = [:document, :zone]
     end
     # Segmenters split a text or zone into sentences.
     module Segmenters
       extend Group
       self.type = :transformer
-      self.targets = [:document, :text, :zone]
+      self.targets = [:document, :zone]
     end
     # Tokenizers split a sentence into Token objects.
     module Tokenizers
       extend Group
       self.type = :transformer
-      self.targets = [:document, :text, :zone, :sentence, :constituent]
+      self.targets = [:document, :zone, :sentence, :constituent]
     end
     # Parsers split a sentence into constituent objects
     # representing its syntactic structure, with the
@@ -37,7 +37,7 @@ module Treat
     module Parsers
       extend Group
       self.type = :transformer
-      self.targets = [:document, :text, :zone, :sentence, :constituent]
+      self.targets = [:document, :zone, :sentence, :constituent]
     end
     # Makes all the groups autoloadable and creates the delegators.
     extend Treat::Category

data/lib/treat/proxies.rb CHANGED Viewed

@@ -1,11 +1,13 @@
 module Treat
-  # Proxies install Treat functions on Rubycore classes.
+  # Proxies install Treat functions on core Ruby classes.
   module Proxies
     # The module proxy provides functionality common
     # to the different types of proxies.
     module Proxy
+      # Build the entity corresponding to the proxied
+      # object and send the method call to the entity.
       def method_missing(sym, *args, &block)
-        if Categories.have_method?(sym)
+        if Treat::Categories.have_method?(sym)
           to_entity.send(sym, *args)
         else
           super(sym, *args, &block)
@@ -16,8 +18,8 @@ module Treat
       end
     end
     # Install Treat functions on String objects.
-    module StringProxy
-      include Proxy
+    module String
+      include Treat::Proxies::Proxy
       # Save the string to the specified file.
       def save(file)
         File.open(file, 'w') { |f| f.write(self) }
@@ -28,16 +30,21 @@ module Treat
       end
     end
     # Install Treat functions on Numeric objects.
-    module NumericProxy
-      include Proxy
+    module Numeric
+      include Treat::Proxies::Proxy
       # Return the entity corresponding to the number.
       def to_entity(builder = nil)
         Treat::Entities::Entity.from_numeric(self)
       end
     end
     # Install Treat functions on Array objects.
-    module ArrayProxy
-      include Proxy
+    module Array
+      include Treat::Proxies::Proxy
+      # The behaviour of this proxy is special:
+      # if a Treat function is called on an array,
+      # the function will be called on each element
+      # of the array and a new array with the
+      # results will be returned.
       def method_missing(sym, *args, &block)
         if Category.has_method?(sym)
           array = []
@@ -59,8 +66,8 @@ module Treat
       end
     end
     # Include the proxies in the core classes.
-    String.class_eval { include StringProxy }
-    Numeric.class_eval { include NumericProxy }
-    Array.class_eval { include ArrayProxy }
+    ::String.class_eval { include Treat::Proxies::String }
+    ::Numeric.class_eval { include Treat::Proxies::Numeric }
+    ::Array.class_eval { include Treat::Proxies::Array }
   end
 end

data/lib/treat/registrable.rb CHANGED Viewed

@@ -3,20 +3,27 @@ module Treat
     # Registers a token in the @token_registry
     # hash in the root node.
     def register_token(token)
-      if is_root?
+      if is_root? || type == :document
         @token_registry ||= {value: {}, id: {}}
         @token_registry[:id][token.id] = token
-        @token_registry[:value][token.value] ||= []
-        @token_registry[:value][token.value] << token
+        @token_registry[:value][token.to_s] ||= []
+        @token_registry[:value][token.to_s] << token
+        if has_parent? && type == :document
+          @parent.register_token(token)
+        end
       else
         @parent.register_token(token)
       end
     end
     # Find the token registry, which is
     # always in the root node.
-    def token_registry
+    def token_registry(type = nil)
+      if self.type == type
+        @token_registry ||= {value: {}, id: {}}
+        return @token_registry
+      end
       if has_parent?
-        @parent.token_registry
+        @parent.token_registry(type)
       else
         @token_registry ||= {value: {}, id: {}}
         @token_registry

data/lib/treat/sugar.rb CHANGED Viewed

@@ -1,5 +1,10 @@
 module Treat
+  # This module provides syntactic sugar in the following manner:
+  # all entities found under Treat::Entities will be made
+  # available within the global namespace. For example,
+  # Treat::Entities::Word can now be referred to as simply 'Word'.
   module Sugar
+    # Installs syntactic sugar.
     def edulcorate
       return if @@edulcorated
       @@edulcorated = true
@@ -13,6 +18,7 @@ module Treat
         end
       end
     end
+    # Uninstalls syntactic sugar.
     def unedulcorate
       return unless @@edulcorated
       @@edulcorated = false
@@ -24,14 +30,13 @@ module Treat
         end
       end
     end
-    # Whtypeher syntactic sugar is
+    # Boolean - whether syntactic sugar is
     # enabled or not.
     def edulcorated?; @@edulcorated; end
     # Syntactic sugar is disabled by default.
     @@edulcorated = false
     private
+    # Helper method, yields each entity type and class.
     def each_entity_class
       Treat::Entities.list.each do |entity_type|
         type = :"#{cc(entity_type)}"

data/lib/treat/tree.rb CHANGED Viewed

@@ -68,18 +68,20 @@ module Treat
         end
         nodes[0]
       end
+      # Retrieve a child node by name or index.
       def [](name_or_index)
         if name_or_index == nil
           raise Treat::Exception,
           "Non-nil name or index needs to be provided."
         end
         if name_or_index.kind_of?(Integer) &&
-          name_or_index < 1000            # Fix
+          name_or_index < 1000
           @children[name_or_index]
         else
           @children_hash[name_or_index]
         end
       end
+      # Remove the supplied node or id of a node from the children.
       def remove!(ion)
         return nil unless ion
         if ion.is_a? Treat::Tree::Node
@@ -91,6 +93,7 @@ module Treat
           @children_hash.delete(ion)
         end
       end
+      # Remove all children.
       def remove_all!
         @children.each { |child| child.set_as_root! }
         @children.clear
@@ -103,14 +106,18 @@ module Treat
         id = @parent.children.index(self)
         @parent.children.at(id + 1) if id
       end
+      # Return the sibling N positions to the left of this one.
       def left(n = 1); sibling(-1*n); end
+      # Return the sibling N positions to the right of this one.
       def right(n = 1); sibling(1*n); end
+      # Return the sibling with position #pos versus
+      # this one. #pos can be ... -1, 0, 1, ...
       def sibling(pos)
         return nil if is_root?
         id = @parent.children.index(self)
         @parent.children.at(id + pos)
       end
-      # There must be a cleaner way to do this.
+      # Return all brothers and sisters of this node.
       def siblings
         r = @parent.children.dup
         r.delete(self)
@@ -133,7 +140,7 @@ module Treat
       # Does the entity have a feature ?
       def has_feature?(feature)
         @features.has_key?(feature) ||
-        feature == :value
+        [:id, :value, :children, :edges].include?(feature)
       end
       alias :has? :has_feature?
       # Link this node to the target node with

data/lib/treat.rb CHANGED Viewed

@@ -1,51 +1,50 @@
-#
 # Main namespace for Treat modules.
 #
-# 1. Entities
+# === Entities
 #
-#   Entities are Tree structures that represent any textual
-#   entity (from a collection of texts down to an individual
-#   word) with a value, features, children and edges linking
-#   it to other textual entities. Sugar provides syntactic sugar
-#   for Entities and can be enabled by running Treat.edulcorate.
+# Entities are Tree structures that represent any textual
+# entity (from a collection of texts down to an individual
+# word) with a value, features, children and edges linking
+# it to other textual entities. Sugar provides syntactic sugar
+# for Entities and can be enabled by running Treat.edulcorate.
 #
-#   Here are some example of how to create entities:
+# Here are some examples of how to create entities:
 #
-#   c = Collection 'folder_with_documents'
-#   d = Document 'filename.txt' # (or PDF, html, xml, png, jpg, gif).
-#   p = Paragraph 'A short story. The end.'
-#   s = Sentence 'That is not a sentence.'
-#   w = Word 'fox'
+#     c = Collection 'folder_with_documents'
+#     d = Document 'filename.txt' # (or PDF, html, xml, png, jpg, gif).
+#     p = Paragraph 'A short story. The end.'
+#     s = Sentence 'That is not a sentence.'
+#     w = Word 'fox'
 #
-#   Here's a full list of entities (subtypes in parentheses):
-#   Collection, Document, Zone (Section, Title, Paragraph or List),
-#   Sentence, Constituent (Phrase or Clause), Token (Word, Number,
-#   Symbol or Punctuation).
+# Here's a full list of entities (subtypes in parentheses):
+# Collection, Document, Zone (Section, Title, Paragraph or List),
+# Sentence, Constituent (Phrase or Clause), Token (Word, Number,
+# Symbol or Punctuation).
+#
+# === Proxies
 #
-# 2. Proxies
+# Proxies allow the Treat functions to be called on the core
+# Ruby classes String, Numeric and Array. They build the entity
+# corresponding to the supplied raw text and send the requested
+# function to it.
 #
-#   Proxies allow the Treat functions to be called on the core
-#   Ruby classes String, Numeric and Array. They build the entity
-#   corresponding to the supplied raw text and send the requested
-#   function to it.
-#
-#   For example,
+# For example,
 #
-#       'fox'.tag
+#     'fox'.tag
 #
-#   Is equivalent to:
+# Is equivalent to:
 #
-#       w = Word 'fox'
-#       w.tag
+#     w = Word 'fox'
+#     w.tag
 #
-# 3. Functions
+# === Functions
 #
-#   A class is defined for each implemented algorithm performing a given
-#   task. These classes are clustered into groups of algorithms performing
-#   the same given task (Group), and the groups are clustered into Categories
-#   of groups performing related tasks.
+# A class is defined for each implemented algorithm performing a given
+# task. These classes are clustered into groups of algorithms performing
+# the same given task (Group), and the groups are clustered into Categories
+# of groups performing related tasks.
 #
-#   Here are the different Categories:
+# Here are the different Categories:
 #
 # - Detectors - Category for language, encoding, and format
 #   detectors.
@@ -60,22 +59,22 @@
 # - Processors - Namespace for algorithms that process collections and
 #   documents into trees.
 #
-# 3. Linguistic resources
+# === Linguistic resources
 #
-#   The Languages module contains linguistic information about
-#   languages (full ISO-639-1 and 2 language list, tag alignments
-#   for three treebanks, word categories, etc.)
+# The Languages module contains linguistic information about
+# languages (full ISO-639-1 and 2 language list, tag alignments
+# for three treebanks, word categories, etc.)
 #
-# 4. Mixins for entities.
+# === Mixins for entities.
 #
-#  Buildable, Delegatable, Visitable and Registrable are
-#  or extended by Entity and provide it with the ability to be built,
-#  to delegate function calls, to accept visitors and to maintain a
-#  token registry, respectively.
+# Buildable, Delegatable, Visitable and Registrable are included in
+# or extended by Entity and provide it with the ability to be built,
+# to delegate function calls, to accept visitors and to maintain a
+# token registry, respectively.
 #
-# 5. Exception
+# === Exception class.
 #
-#  Exception defines a custom exception for the Treat module.
+# Exception defines a custom exception class for the Treat module.
 #
 module Treat
@@ -85,20 +84,20 @@ module Treat
   end
   # The current version of Treat.
-  VERSION = "0.1.2"
+  VERSION = "0.1.3"
-#  $LOAD_PATH << '/ruby/treat/lib/' # Remove for release
+  # $LOAD_PATH << '/ruby/treat/lib/' # Remove for release
   # Create class variables for the Treat module.
   class << self
-    # Default language to use when detect_language is false
+    # Symbol - default language to use when detect_language is false.
     attr_accessor :default_language
-    # Default encoding to use.
+    # Symbol - default encoding to use.
     attr_accessor :default_encoding
     # Boolean - detect language or use default?
     attr_accessor :detect_language
-    # Identifier - the ideal entity level to detect language at
-    # (:entity, :sentence, :zone, :text, :document, klass.)
+    # Symbol - the ideal entity level to detect language at
+    # (e.g., :entity, :sentence, :zone, :section, :document)
     attr_accessor :language_detection_level
     # String - main folder for executable files.
     attr_accessor :bin
@@ -117,13 +116,13 @@ module Treat
   # Turn language detection off by default.
   self.detect_language = false
   # Detect the language once per text by default.
-  self.language_detection_level = :text
+  self.language_detection_level = :section
   # Set the lib path to that of this file.
   self.lib = File.dirname(__FILE__)
   # Set the paths to the bin, test and tmp folders.
-  self.bin = self.lib + '/../bin/'
-  self.test = self.lib + '/../test/'
-  self.tmp = self.lib + '/../tmp/'
+  self.bin = self.lib + '/../bin'
+  self.test = self.lib + '/../test'
+  self.tmp = self.lib + '/../tmp'
   # Require modified core classes.
   require 'treat/object'
@@ -137,6 +136,7 @@ module Treat
   require 'treat/proxies'
   require 'treat/sugar'
+  # Make sugar available when needed.
   extend Sugar
 end

data/test/tc_entity.rb CHANGED Viewed

@@ -2,7 +2,7 @@ module Treat
   module Tests
     class TestEntity < Test::Unit::TestCase
       def setup
-        @text = Treat::Entities::Text.new
+        @text = Treat::Entities::Section.new
         @sentence = Treat::Entities::Sentence.new
@@ -14,23 +14,23 @@ module Treat
         @adj_phrase.set :tag, 'ADJP'
         @det = Treat::Entities::Word.new('The')
-        @det.set :cat, :determiner
+        @det.set :category, :determiner
         @det.set :tag, 'DT'
         @det.set :tag_set, :penn
         @adj = Treat::Entities::Word.new('lazy')
-        @adj.set :cat, :adjective
+        @adj.set :category, :adjective
         @adj.set :tag, 'JJ'
         @adj.set :tag_set, :penn
         @noun = Treat::Entities::Word.new('fox')
-        @noun.set :cat, :noun
+        @noun.set :category, :noun
         @noun.set :tag, 'NN'
         @noun.set :tag_set, :penn
         @aux = Treat::Entities::Word.new('is')
-        @aux.set :cat, :verb
+        @aux.set :category, :verb
         @aux.set :tag, 'VBZ'
         @aux.set :tag_set, :penn
         @verb = Treat::Entities::Word.new('running')
-        @verb.set :cat, :verb
+        @verb.set :category, :verb
         @verb.set :tag, 'VBG'
         @verb.set :tag_set, :penn
         @dot = Treat::Entities::Punctuation.new('.')
@@ -62,7 +62,7 @@ module Treat
       end
       def test_type
-        assert_equal :text, @text.type
+        assert_equal :section, @text.type
       end
       def test_printers

data/test/tc_extractors.rb CHANGED Viewed

@@ -25,9 +25,11 @@ module Treat
         # assert_nothing_raised { @doc.named_entity(:abner) }
       end
-      def test_key_sentences
-        topics = @doc.topic_words(:lda)
-        assert_nothing_raised { @doc.key_sentences(:topics_frequency, topics) }
+      def test_keywords
+        assert_nothing_raised do
+          topics = @doc.topic_words(:lda)
+          @doc.keywords(:topics_frequency, topic_words: topics)
+        end
       end
       def test_topics
@@ -38,7 +40,7 @@ module Treat
         @doc.chunk.segment(:tactful).tokenize
         assert_nothing_raised { @doc.statistics(:frequency_of, value: 'the') }
-        assert_nothing_raised { @word.statistics(:frequency) }
+        assert_nothing_raised { @word.statistics(:frequency_in) }
         # assert_nothing_raised { @doc.statistics(:position_in) }
         # assert_nothing_raised { @doc.statistics(:transition_matrix) }
         # assert_nothing_raised { @doc.statistics(:transition_probability) }

data/test/tc_formatters.rb CHANGED Viewed

@@ -37,10 +37,6 @@ module Treat
         assert_nothing_raised { @sentence.visualize(:standoff) }
       end
-      def test_cleaners
-        assert_nothing_raised { @html_doc.clean(:html) }
-      end
     end
   end
 end