treat 1.0.4 → 1.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44)
  1. data/LICENSE +0 -1
  2. data/files/INFO +1 -1
  3. data/lib/treat/entities/abilities/buildable.rb +2 -6
  4. data/lib/treat/entities/abilities/checkable.rb +2 -2
  5. data/lib/treat/entities/abilities/delegatable.rb +2 -2
  6. data/lib/treat/entities/abilities/doable.rb +6 -1
  7. data/lib/treat/entities/abilities/iterable.rb +8 -0
  8. data/lib/treat/entities/abilities/magical.rb +1 -1
  9. data/lib/treat/extractors.rb +1 -1
  10. data/lib/treat/formatters/visualizers/standoff.rb +1 -1
  11. data/lib/treat/groupable.rb +4 -0
  12. data/lib/treat/installer.rb +33 -13
  13. data/lib/treat/kernel.rb +0 -4
  14. data/lib/treat/languages/arabic.rb +1 -1
  15. data/lib/treat/languages/chinese.rb +1 -1
  16. data/lib/treat/languages/dutch.rb +1 -1
  17. data/lib/treat/languages/english.rb +1 -1
  18. data/lib/treat/languages/french.rb +4 -4
  19. data/lib/treat/languages/german.rb +3 -3
  20. data/lib/treat/languages/italian.rb +1 -1
  21. data/lib/treat/{linguistics/categories.rb → languages/language.rb} +3 -4
  22. data/lib/treat/languages/polish.rb +1 -1
  23. data/lib/treat/languages/portuguese.rb +1 -1
  24. data/lib/treat/languages/russian.rb +1 -1
  25. data/lib/treat/languages/spanish.rb +1 -1
  26. data/lib/treat/languages/swedish.rb +1 -1
  27. data/lib/treat/lexicalizers/categorizers/from_tag.rb +14 -10
  28. data/lib/treat/lexicalizers/taggers/brill.rb +1 -1
  29. data/lib/treat/lexicalizers/taggers/stanford.rb +5 -2
  30. data/lib/treat/lexicalizers.rb +2 -1
  31. data/lib/treat/processors/parsers/enju.rb +2 -2
  32. data/lib/treat/processors/parsers/stanford.rb +17 -11
  33. data/lib/treat/processors/segmenters/punkt.rb +5 -2
  34. data/lib/treat/processors/segmenters/tactful.rb +5 -1
  35. data/lib/treat/processors/tokenizers/ptb.rb +11 -3
  36. data/lib/treat/processors/tokenizers/punkt.rb +0 -3
  37. data/lib/treat/processors/tokenizers/tactful.rb +3 -0
  38. data/lib/treat/universalisation/encodings.rb +12 -0
  39. data/lib/treat/{linguistics → universalisation}/tags.rb +77 -46
  40. data/lib/treat/universalisation.rb +9 -0
  41. data/lib/treat.rb +2 -2
  42. metadata +6 -6
  43. data/lib/treat/linguistics.rb +0 -9
  44. data/lib/treat/processors/tokenizers/perl.rb +0 -132
data/LICENSE CHANGED
@@ -20,7 +20,6 @@ Non-trivial amount of code has been incorporated and modified from other librari
20
20
  - formatters/readers/odt.rb - Mark Watson (GPL license)
21
21
  - processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
22
22
  - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
23
- - processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
24
23
  - processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
25
24
  - extractors/topics/reuters.rb - Mark Watson (GPL license)
26
25
  - inflectors/declensions/english.rb - Thomas Sawyer (MIT license)
data/files/INFO CHANGED
@@ -1 +1 @@
1
- This is a folder containing the files downloaded by Treat.
1
+ This is a folder containing the files downloaded by Treat from the internet.
@@ -4,12 +4,11 @@
4
4
  # is pretty much self-explanatory.
5
5
  module Treat::Entities::Abilities::Buildable
6
6
 
7
- require 'treat/helpers/decimal_point_escaper'
8
7
  require 'fileutils'
9
8
 
10
9
  # Simple regexps to match common entities.
11
10
  WordRegexp = /^[[:alpha:]\-']+$/
12
- NumberRegexp = /^#?([0-9]+)(\^\^[0-9]+)?$/
11
+ NumberRegexp = /^#?([0-9]+)(\.[0-9]+)?$/
13
12
  PunctRegexp = /^[[:punct:]\$]+$/
14
13
  UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
15
14
  EmailRegexp = /.+\@.+\..+/
@@ -57,8 +56,6 @@ module Treat::Entities::Abilities::Buildable
57
56
  # instead of from_string directly).
58
57
  def from_string(string, enforce_type = false)
59
58
 
60
- Treat::Helpers::DecimalPointEscaper.escape!(string)
61
-
62
59
  enforce_type = true if caller_method == :build
63
60
 
64
61
  unless self == Treat::Entities::Entity
@@ -74,6 +71,7 @@ module Treat::Entities::Abilities::Buildable
74
71
  end
75
72
 
76
73
  e
74
+
77
75
  end
78
76
 
79
77
  # Build a document from an URL.
@@ -116,7 +114,6 @@ module Treat::Entities::Abilities::Buildable
116
114
  "a numeric object."
117
115
  end
118
116
  n = numeric.to_s
119
- Treat::Helpers::DecimalPointEscaper.unescape!(n)
120
117
  Treat::Entities::Number.new(n)
121
118
  end
122
119
 
@@ -319,7 +316,6 @@ module Treat::Entities::Abilities::Buildable
319
316
  end
320
317
 
321
318
  def create_collection(fv)
322
- debug("Creating new collection in directory #{fv}.")
323
319
  FileUtils.mkdir(fv)
324
320
  Treat::Entities::Collection.new(fv)
325
321
  end
@@ -24,8 +24,8 @@ module Treat::Entities::Abilities::Checkable
24
24
  return unless has_children?
25
25
  raise Treat::Exception,
26
26
  "Warning: can't #{caller_method(2)} "+
27
- "an entity that has children. Removing " +
28
- " all children of text \"[#{short_value}].\""
27
+ "the text \"#{short_value}\", because it " +
28
+ "already has children."
29
29
  end
30
30
 
31
31
  end
@@ -104,9 +104,9 @@ module Treat::Entities::Abilities::Delegatable
104
104
  if !klass[g] || !klass[g][0]
105
105
  d = ucc(cl(group))
106
106
  d.gsub!('_', ' ')
107
- d = 'worker to find "' + d
107
+ d = d[0..-2]
108
108
  raise Treat::Exception, "No #{d}" +
109
- "\" is available for the " +
109
+ " is available for the " +
110
110
  "#{lang.to_s.capitalize} language."
111
111
  end
112
112
  return klass[g][0]
@@ -37,8 +37,13 @@ module Treat::Entities::Abilities::Doable
37
37
  end
38
38
  if f || entity_types.include?(:entity)
39
39
  send(task, worker, options)
40
+ if group.recursive
41
+ each do |entity|
42
+ entity.do_task(task, worker, options, group)
43
+ end
44
+ end
40
45
  else
41
- each_entity(*entity_types) do |entity|
46
+ each do |entity|
42
47
  entity.do_task(task, worker, options, group)
43
48
  end
44
49
  unless entity_types.include?(type)
@@ -95,6 +95,14 @@ module Treat::Entities::Abilities::Iterable
95
95
  as
96
96
  end
97
97
 
98
+ # Returns the first ancestor that has a feature
99
+ # with the given name, otherwise nil.
100
+ def ancestor_with_feature(type, feature)
101
+ each_ancestor(type) do |ancestor|
102
+ return ancestor if ancestor.has?(feature)
103
+ end
104
+ end
105
+
98
106
  alias :ancestors_with_type :ancestors_with_types
99
107
 
100
108
  # Number of children that have a given feature.
@@ -25,7 +25,7 @@ module Treat::Entities::Abilities::Magical
25
25
  def magic(sym, *args)
26
26
 
27
27
  @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
28
- @@cats_regexp ||= "(#{Treat::Linguistics::WordCategories.join('|')})"
28
+ @@cats_regexp ||= "(#{Treat::Languages::Language::WordCategories.join('|')})"
29
29
 
30
30
  method = sym.to_s =~ /entities/ ?
31
31
  sym.to_s.gsub('entities', 'entitys') :
@@ -27,7 +27,7 @@ module Treat::Extractors
27
27
  module Keywords
28
28
  extend Treat::Groupable
29
29
  self.type = :annotator
30
- self.targets = [:document]
30
+ self.targets = [:document, :section, :zone]
31
31
  end
32
32
 
33
33
  # Extract clusters of topic words from a collection.
@@ -44,7 +44,7 @@ class Treat::Formatters::Visualizers::Standoff
44
44
  end
45
45
 
46
46
  def self.ptb_escape(val)
47
- Treat::Linguistics::Tags::
47
+ Treat::Universalisation::Tags::
48
48
  PTBEscapeCharacters.each do |char, esc|
49
49
  val.gsub!(char, val)
50
50
  end
@@ -95,8 +95,12 @@ module Treat::Groupable
95
95
  attr_accessor :presets
96
96
  # The preset option to use with preset functions.
97
97
  attr_accessor :preset_option
98
+ # Whether to recurse within multiple targets or not.
99
+ attr_accessor :recursive
98
100
  end
99
101
 
102
+ self.recursive = false
103
+
100
104
  # Return the method corresponding to the group.
101
105
  # This method resolves the name of the method
102
106
  # that a group should provide based on the name
@@ -82,9 +82,13 @@ module Treat::Installer
82
82
  begin
83
83
  Gem::Specification.find_by_name('punkt-segmenter')
84
84
  title "Downloading model for the Punkt segmenter for the #{l}."
85
- download_punkt_models(language)
85
+ # Need fix
86
+ download_punkt_models([language.to_s])
86
87
  rescue Gem::LoadError; end
87
-
88
+
89
+ # Download reuters models always
90
+ download_reuters_models
91
+
88
92
  # If stanford is installed, download models.
89
93
  begin
90
94
  Gem::Specification.find_by_name('stanford-core-nlp')
@@ -92,7 +96,10 @@ module Treat::Installer
92
96
  "model files for the the #{l}.\n\n"
93
97
  package = (language == :english) ? :english : :all
94
98
  download_stanford(package)
95
- rescue Gem::LoadError; end
99
+ rescue Gem::LoadError
100
+ puts 'Stanford-core-nlp gem not installed.'
101
+ puts 'Skipping download of Stanford models.'
102
+ end
96
103
 
97
104
  title "Install external binary libraries " +
98
105
  "(requires port, apt-get or win-get).\n"
@@ -124,7 +131,7 @@ module Treat::Installer
124
131
  install_dependencies(false)
125
132
  install_language_dependencies(dep, false)
126
133
  download_stanford(:minimal)
127
- download_punkt_models(:english)
134
+ download_punkt_models([:english])
128
135
  end
129
136
 
130
137
  def self.install_dependencies(optionally)
@@ -166,7 +173,7 @@ module Treat::Installer
166
173
  unless man
167
174
  puts "Skipping installation of the "+
168
175
  "following binaries:\n\n"
169
- Binaries.each do |binary, purpose|
176
+ Binary.each do |binary, purpose|
170
177
  puts "- #{binary} to #{purpose}"
171
178
  end
172
179
  return
@@ -227,22 +234,35 @@ module Treat::Installer
227
234
 
228
235
  end
229
236
 
230
- def self.download_punkt_models(language)
237
+ def self.download_punkt_models(languages)
238
+ languages.map! { |l| "#{l}.yaml" }
239
+ download_models 'punkt', languages
240
+ end
241
+
242
+ def self.download_reuters_models
243
+ files = ["industry.xml", "region.xml", "topics.xml"]
244
+ download_models 'reuters', files
245
+ end
231
246
 
232
- f = "#{language}.yaml"
233
- dest = "#{Treat.models}punkt/"
247
+ def self.download_models(directory, files)
234
248
 
235
- loc = Treat::Downloader.download(
236
- 'http', Server, 'treat/punkt', f, Treat.tmp)
249
+ dest = "#{Treat.models}#{directory}/"
237
250
 
238
251
  unless File.readable?(dest)
239
- puts "- Creating directory models/punkt ..."
252
+ puts "- Creating directory models/#{directory} ..."
240
253
  FileUtils.mkdir_p(File.absolute_path(dest))
241
254
  end
242
255
 
243
- puts "- Copying model file to models/punkt ..."
244
- FileUtils.cp(loc, File.join(Paths[:models], 'punkt', f))
245
256
 
257
+ files.each do |file|
258
+ puts "- Downloading #{file} ..."
259
+ loc = Treat::Downloader.download(
260
+ 'http', Server, "treat/#{directory}", file, Treat.tmp)
261
+ puts "- Copying file to models/#{directory} ..."
262
+ FileUtils.cp(loc, File.join(Paths[:models], directory, file))
263
+ end
264
+
265
+
246
266
  puts "- Cleaning up..."
247
267
  FileUtils.rm_rf(Paths[:tmp] + Server)
248
268
 
data/lib/treat/kernel.rb CHANGED
@@ -181,10 +181,6 @@ module Kernel
181
181
  NULL_DEVICE = '/dev/null'
182
182
  end
183
183
 
184
- def debug(msg)
185
- puts msg if Treat.debug
186
- end
187
-
188
184
  def prompt(msg, valid_answers)
189
185
 
190
186
  msg = msg
@@ -6,7 +6,7 @@ class Treat::Languages::Arabic
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford]
9
+ :taggers => [:stanford]
10
10
  }
11
11
  Processors = {
12
12
  :parsers => [:stanford]
@@ -6,7 +6,7 @@ class Treat::Languages::Chinese
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford]
9
+ :taggers => [:stanford]
10
10
  }
11
11
  Processors = {
12
12
  :parsers => [:stanford]
@@ -9,7 +9,7 @@ class Treat::Languages::Dutch
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -31,7 +31,7 @@ class Treat::Languages::English
31
31
  :chunkers => [:txt],
32
32
  :parsers => [:stanford, :enju],
33
33
  :segmenters => [:tactful, :punkt, :stanford],
34
- :tokenizers => [:perl, :ptb, :stanford, :tactful, :punkt]
34
+ :tokenizers => [:ptb, :stanford, :tactful, :punkt]
35
35
  }
36
36
 
37
37
  Retrievers = {
@@ -6,14 +6,14 @@ class Treat::Languages::French
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford],
10
- :category => [:from_tag]
9
+ :taggers => [:stanford],
10
+ :categorizers => [:from_tag]
11
11
  }
12
12
  Processors = {
13
13
  :chunkers => [:txt],
14
14
  :parsers => [:stanford],
15
- :segmenters => [:punkt],
16
- :tokenizers => [:perl, :tactful]
15
+ :segmenters => [:tactful],
16
+ :tokenizers => [:tactful]
17
17
  }
18
18
  Retrievers = {}
19
19
 
@@ -6,14 +6,14 @@ class Treat::Languages::German
6
6
  Extractors = {}
7
7
  Inflectors = {}
8
8
  Lexicalizers = {
9
- :tag => [:stanford],
10
- :category => [:from_tag]
9
+ :taggers => [:stanford],
10
+ :categorizers => [:from_tag]
11
11
  }
12
12
  Processors = {
13
13
  :chunkers => [:txt],
14
14
  :parsers => [:stanford],
15
15
  :segmenters => [:punkt],
16
- :tokenizers => [:perl, :tactful]
16
+ :tokenizers => [:tactful]
17
17
  }
18
18
  Retrievers = {}
19
19
 
@@ -10,7 +10,7 @@ class Treat::Languages::Italian
10
10
  :chunkers => [:txt],
11
11
  :parsers => [:stanford],
12
12
  :segmenters => [:punkt],
13
- :tokenizers => [:perl, :tactful]
13
+ :tokenizers => [:tactful]
14
14
  }
15
15
  Retrievers = {}
16
16
 
@@ -1,11 +1,10 @@
1
- module Treat::Linguistics
2
-
3
- # A list of all possible word categories.
1
+ class Treat::Languages::Language
2
+
4
3
  WordCategories = [
5
4
  :adjective, :adverb, :noun, :verb, :interjection,
6
5
  :clitic, :coverb, :conjunction, :determiner, :particle,
7
6
  :preposition, :pronoun, :number, :symbol, :punctuation,
8
7
  :complementizer
9
8
  ]
10
-
9
+
11
10
  end
@@ -9,7 +9,7 @@ class Treat::Languages::Polish
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Portuguese
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Russian
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Spanish
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -9,7 +9,7 @@ class Treat::Languages::Swedish
9
9
  Processors = {
10
10
  :chunkers => [:txt],
11
11
  :segmenters => [:punkt],
12
- :tokenizers => [:perl, :tactful]
12
+ :tokenizers => [:tactful]
13
13
  }
14
14
  Retrievers = {}
15
15
 
@@ -3,17 +3,19 @@
3
3
  # from its tag (e.g. 'S', 'NP', 'VBZ', 'ADV', etc.).
4
4
  class Treat::Lexicalizers::Categorizers::FromTag
5
5
 
6
- Pttc = Treat::Linguistics::Tags::PhraseTagToCategory
7
- Wttc = Treat::Linguistics::Tags::WordTagToCategory
8
- Ptc = Treat::Linguistics::Tags::PunctuationToCategory
6
+ Pttc = Treat::Universalisation::Tags::PhraseTagToCategory
7
+ Wttc = Treat::Universalisation::Tags::WordTagToCategory
8
+ Ptc = Treat::Universalisation::Tags::PunctuationToCategory
9
9
 
10
10
  # Find the category of the entity from its tag.
11
11
  def self.category(entity, options = {})
12
12
 
13
13
  tag = entity.check_has(:tag)
14
+
14
15
  return :unknown if tag.nil? || tag == '' || entity.type == :symbol
15
16
  return :sentence if tag == 'S' || entity.type == :sentence
16
17
  return :number if entity.type == :number
18
+
17
19
  return Ptc[entity.to_s] if entity.type == :punctuation
18
20
 
19
21
  if entity.is_a?(Treat::Entities::Phrase)
@@ -29,15 +31,17 @@ class Treat::Lexicalizers::Categorizers::FromTag
29
31
 
30
32
  if entity.has?(:tag_set)
31
33
  ts = entity.get(:tag_set)
32
- elsif entity.parent_phrase &&
33
- entity.parent_phrase.has?(:tag_set)
34
- ts = entity.parent_phrase.get(:tag_set)
35
34
  else
36
- raise Treat::Exception,
37
- "No information can be found regarding "+
38
- "which tag set to use."
35
+ a = entity.ancestor_with_feature(:phrase, :tag_set)
36
+ if a
37
+ ts = a.get(:tag_set)
38
+ else
39
+ raise Treat::Exception,
40
+ "No information can be found regarding "+
41
+ "which tag set to use."
42
+ end
39
43
  end
40
-
44
+
41
45
  if cat[ts]
42
46
  return cat[ts]
43
47
  else
@@ -35,7 +35,7 @@ module Treat::Lexicalizers::Taggers::Brill
35
35
  # Tokenize the sentence/phrase.
36
36
  if !entity.has_children? &&
37
37
  !entity.is_a?(Treat::Entities::Token)
38
- entity.tokenize(:perl, options)
38
+ entity.tokenize(options)
39
39
  end
40
40
 
41
41
  # Create the tagger if necessary
@@ -38,11 +38,14 @@ class Treat::Lexicalizers::Taggers::Stanford
38
38
  end
39
39
 
40
40
  # Handle tags for sentences and phrases.
41
-
42
41
  if entity.is_a?(Treat::Entities::Sentence) ||
43
42
  (entity.is_a?(Treat::Entities::Phrase) &&
44
43
  !entity.parent_sentence)
45
- entity.set :tag_set, :penn
44
+
45
+ tag_set = Treat::Universalisation::Tags::
46
+ StanfordTagSetForLanguage[
47
+ Treat::Languages.describe(lang)]
48
+ entity.set :tag_set, tag_set
46
49
  end
47
50
 
48
51
  if entity.is_a?(Treat::Entities::Sentence)
@@ -16,7 +16,8 @@ module Treat::Lexicalizers
16
16
  module Categorizers
17
17
  extend Treat::Groupable
18
18
  self.type = :annotator
19
- self.targets = [:token]
19
+ self.targets = [:sentence, :phrase, :token]
20
+ self.recursive = true
20
21
  self.default = :from_tag
21
22
  end
22
23
 
@@ -23,10 +23,10 @@ module Treat::Processors::Parsers::Enju
23
23
  @@parser = nil
24
24
 
25
25
  # A hash of Enju cat tags mapped to word categories.
26
- Ectc = Treat::Linguistics::Tags::EnjuCatToCategory
26
+ Ectc = Treat::Universalisation::Tags::EnjuCatToCategory
27
27
 
28
28
  # A hash of Enju cat/xcat pairs mapped to PTB tags.
29
- Ecxtp = Treat::Linguistics::Tags::EnjuCatXcatToPTB
29
+ Ecxtp = Treat::Universalisation::Tags::EnjuCatXcatToPTB
30
30
 
31
31
  # Parse the entity into its syntactical
32
32
  # phrases using Enju.
@@ -27,6 +27,10 @@ class Treat::Processors::Parsers::Stanford
27
27
  lang = entity.language
28
28
  init(lang, options)
29
29
 
30
+ tag_set = Treat::Universalisation::Tags::
31
+ StanfordTagSetForLanguage[
32
+ Treat::Languages.describe(lang)]
33
+
30
34
  text = ::StanfordCoreNLP::Text.new(val)
31
35
  @@parsers[lang].annotate(text)
32
36
 
@@ -37,17 +41,18 @@ class Treat::Processors::Parsers::Stanford
37
41
  tag = s.get(:category).to_s
38
42
  tag_s, tag_opt = *tag.split('-')
39
43
  tag_s ||= 'S'
40
- entity.set :tag_set, :penn
41
44
  entity.set :tag, tag_s
42
45
  entity.set :tag_opt, tag_opt if tag_opt
43
- recurse(s.get(:tree).children[0], entity)
44
- break
46
+ recurse(s.get(:tree).children[0], entity, tag_set)
47
+ break #######
45
48
  else
46
49
  recurse(s.get(:tree), entity)
47
50
  end
48
51
 
49
52
  end
50
53
 
54
+ entity.set :tag_set, tag_set
55
+
51
56
  end
52
57
 
53
58
  def self.init(lang, options)
@@ -76,7 +81,7 @@ class Treat::Processors::Parsers::Stanford
76
81
 
77
82
  # Helper method which recurses the tree supplied by
78
83
  # the Stanford parser.
79
- def self.recurse(java_node, ruby_node, additional_tags = [])
84
+ def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
80
85
 
81
86
  if java_node.num_children == 0
82
87
 
@@ -85,10 +90,8 @@ class Treat::Processors::Parsers::Stanford
85
90
  tag_s, tag_opt = *tag.split('-')
86
91
  tag_s ||= ''
87
92
  ruby_node.value = java_node.value.to_s.strip
88
- ruby_node.set :tag_set, :penn
89
93
  ruby_node.set :tag, tag_s
90
94
  ruby_node.set :tag_opt, tag_opt if tag_opt
91
- ruby_node.set :tag_set, :penn
92
95
  ruby_node.set :lemma, label.get(:lemma).to_s
93
96
 
94
97
  additional_tags.each do |t|
@@ -103,33 +106,35 @@ class Treat::Processors::Parsers::Stanford
103
106
  if java_node.num_children == 1 &&
104
107
  java_node.children[0].num_children == 0
105
108
  recurse(java_node.children[0],
106
- ruby_node, additional_tags)
109
+ ruby_node, tag_set, additional_tags)
107
110
  return
108
111
  end
109
112
 
110
113
  java_node.children.each do |java_child|
114
+
111
115
  label = java_child.label
112
116
  tag = label.get(:category).to_s
113
117
  tag_s, tag_opt = *tag.split('-')
114
118
  tag_s ||= ''
115
-
116
- if Treat::Linguistics::Tags::PhraseTagToCategory[tag_s]
119
+
120
+ if Treat::Universalisation::Tags::PhraseTagToCategory[tag_s] &&
121
+ Treat::Universalisation::Tags::PhraseTagToCategory[tag_s][tag_set]
117
122
  ruby_child = Treat::Entities::Phrase.new
118
123
  else
119
124
  l = java_child.children[0].to_s
120
125
  v = java_child.children[0].value.to_s.strip
126
+
121
127
  # Mhmhmhmhmhm
122
128
  val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
123
129
  ruby_child = Treat::Entities::Token.from_string(val)
124
130
  end
125
131
 
126
- ruby_child.set :tag_set, :penn
127
132
  ruby_child.set :tag, tag_s
128
133
  ruby_child.set :tag_opt, tag_opt if tag_opt
129
134
  ruby_node << ruby_child
130
135
 
131
136
  unless java_child.children.empty?
132
- recurse(java_child, ruby_child, additional_tags)
137
+ recurse(java_child, ruby_child, tag_set, additional_tags)
133
138
  end
134
139
 
135
140
  end
@@ -137,4 +142,5 @@ class Treat::Processors::Parsers::Stanford
137
142
  end
138
143
 
139
144
  end
145
+
140
146
  end
@@ -38,15 +38,18 @@ module Treat::Processors::Segmenters::Punkt
38
38
 
39
39
  s = entity.to_s
40
40
 
41
- # Replace all decimal points by ^^
41
+ # Replace the point in all floating-point numbers
42
+ # by ^^; this is a fix since Punkt trips on decimal
43
+ # numbers.
42
44
  Treat::Helpers::DecimalPointEscaper.escape!(s)
43
- s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
45
+ s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
44
46
 
45
47
  result = @@segmenters[lang].
46
48
  sentences_from_text(s,
47
49
  :output => :sentences_text)
48
50
 
49
51
  result.each do |sentence|
52
+ # Unescape the sentence.
50
53
  Treat::Helpers::DecimalPointEscaper.
51
54
  unescape!(sentence)
52
55
  entity << Treat::Entities::Phrase.
@@ -29,17 +29,21 @@ module Treat::Processors::Segmenters::Tactful
29
29
  entity.check_hasnt_children
30
30
 
31
31
  s = entity.to_s
32
+
32
33
  Treat::Helpers::DecimalPointEscaper.escape!(s)
33
34
 
34
- s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
35
+ s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
35
36
 
36
37
  @@segmenter ||= TactfulTokenizer::Model.new
37
38
 
38
39
  sentences = @@segmenter.tokenize_text(s)
40
+
39
41
  sentences.each do |sentence|
40
42
  Treat::Helpers::DecimalPointEscaper.unescape!(sentence)
43
+ puts sentence.to_s if sentence.to_s.include?('staff')
41
44
  entity << Treat::Entities::Phrase.from_string(sentence)
42
45
  end
46
+
43
47
  end
44
48
 
45
49
  end
@@ -1,3 +1,4 @@
1
+ # encoding: utf-8
1
2
  # A native rule-basd tokenizer based on the one
2
3
  # developped by Robert Macyntyre in 1995 for the Penn
3
4
  # Treebank project. This tokenizer follows the
@@ -11,8 +12,6 @@
11
12
  # you can redistribute it and/or modify it under the
12
13
  # same terms as Ruby itself.
13
14
  module Treat::Processors::Tokenizers::PTB
14
-
15
- require 'treat/helpers/decimal_point_escaper'
16
15
 
17
16
  # Tokenize the entity using a native rule-based algorithm.
18
17
  def self.tokenize(entity, options = {})
@@ -33,8 +32,17 @@ module Treat::Processors::Tokenizers::PTB
33
32
 
34
33
  # Helper method to split the string into tokens.
35
34
  def self.split(string)
35
+
36
36
  s = " " + string + " "
37
- Treat::Helpers::DecimalPointEscaper.escape!(s)
37
+
38
+ # Translate some common extended ascii
39
+ # characters to quotes
40
+ s.gsub!(/‘/,'`')
41
+ s.gsub!(/’/,"'")
42
+ s.gsub!(/“/,"``")
43
+ s.gsub!(/”/,"''")
44
+
45
+
38
46
  s.gsub!(/\s+/," ")
39
47
  s.gsub!(/(\s+)''/,'\1"')
40
48
  s.gsub!(/(\s+)``/,'\1"')
@@ -14,8 +14,6 @@
14
14
  # Project website: https://github.com/lfcipriani/punkt-segmenter
15
15
  class Treat::Processors::Tokenizers::Punkt
16
16
 
17
- require 'treat/helpers/decimal_point_escaper'
18
-
19
17
  SentEndChars = ['.', '?', '!']
20
18
  ReSentEndChars = /[.?!]/
21
19
  InternalPunctuation = [',', ':', ';']
@@ -35,7 +33,6 @@ class Treat::Processors::Tokenizers::Punkt
35
33
  entity.check_hasnt_children
36
34
 
37
35
  s = entity.to_s
38
- Treat::Helpers::DecimalPointEscaper.escape!(s)
39
36
 
40
37
  s.scan(ReWordTokenizer).each do |token|
41
38
  if SentEndChars.include?(token[-1])
@@ -51,6 +51,7 @@ class Treat::Processors::Tokenizers::Tactful
51
51
  entity.check_hasnt_children
52
52
 
53
53
  s = entity.to_s
54
+
54
55
  Treat::Helpers::DecimalPointEscaper.escape!(s)
55
56
 
56
57
  ReTokenize.each do |rules|
@@ -58,6 +59,8 @@ class Treat::Processors::Tokenizers::Tactful
58
59
  end
59
60
 
60
61
  s.split(' ').each do |token|
62
+
63
+ Treat::Helpers::DecimalPointEscaper.unescape!(token)
61
64
  entity << Treat::Entities::Token.
62
65
  from_string(token)
63
66
  end
@@ -0,0 +1,12 @@
1
+ module Treat::Universalisation
2
+
3
+ Encodings = {
4
+ :arabic => 'UTF-8',
5
+ :chinese => 'GB18030',
6
+ :english => 'UTF-8',
7
+ :french => 'ISO_8859-1',
8
+ :german => 'ISO_8859-1',
9
+ :hebrew => 'UTF-8'
10
+ }
11
+
12
+ end
@@ -1,14 +1,20 @@
1
- module Treat::Linguistics::Tags
1
+ module Treat::Universalisation::Tags
2
2
 
3
3
  ClawsC5 = 0
4
4
  Brown = 1
5
5
  Penn = 2
6
- Negra = 3
6
+ Stuttgart = 3
7
7
  PennChinese = 4
8
- Simple = 5
8
+ Paris7 = 5
9
+
10
+ StanfordTagSetForLanguage = {
11
+ :french => :paris7,
12
+ :english => :penn,
13
+ :german => :stuttgart
14
+ }
9
15
 
10
16
  PTBClauseTagDescription = [
11
- ['S', 'Simple declarative clause'],
17
+ ['S', 'Paris7 declarative clause'],
12
18
  ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
13
19
  ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
14
20
  ['SINV', 'Inverted declarative sentence'],
@@ -26,30 +32,33 @@ module Treat::Linguistics::Tags
26
32
 
27
33
  AlignedPhraseTags =
28
34
  [
29
- 'Adjective phrase', ['', '', 'ADJP'],
30
- 'Adverb phrase', ['', '', 'ADVP'],
31
- 'Conjunction phrase', ['', '', 'CONJP'],
32
- 'Fragment', ['', '', 'FRAG'],
33
- 'Interjection', ['', '', 'INTJ'],
34
- 'List marker', ['', '', 'LST'],
35
- 'Not a phrase', ['', '', 'NAC'],
36
- 'Noun phrase', ['', '', 'NP'],
37
- 'Head of NP', ['', '', 'NX'],
38
- 'Prepositional phrase', ['', '', 'PP'],
39
- 'Parenthetical', ['', '', 'PRN'],
40
- 'Particle', ['', '', 'PRT'],
41
- 'Quantifier phrase', ['', '', 'QP'],
42
- 'Reduced relative clause', ['', '', 'RRC'],
43
- 'Unlike coordinated phrase', ['', '', 'UCP'],
44
- 'Verb phrase', ['', '', 'VP'],
45
- 'Wh adjective phrase', ['', '', 'WHADJP'],
46
- 'Wh adverb phrase', ['', '', 'WHAVP'],
47
- 'Wh noun phrase', ['', '', 'WHNP'],
48
- 'Wh prepositional phrase', ['', '', 'WHPP'],
49
- 'Unknown', ['', '', 'X'],
50
- 'Phrase', ['', '', 'P'],
51
- 'Sentence', ['', '', 'S'],
52
- 'Phrase', ['', '', 'SBAR'] # Fix
35
+ 'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
36
+ 'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
37
+ 'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
38
+ 'Fragment', ['', '', 'FRAG', '', '', ''],
39
+ 'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
40
+ 'List marker', ['', '', 'LST', '', '', ''],
41
+ 'Not a phrase', ['', '', 'NAC', '', '', ''],
42
+ 'Noun phrase', ['', '', 'NP', '', '', 'NP'],
43
+ 'Verbal nucleus', ['', '', '', '', '', 'VN'],
44
+ 'Head of noun phrase', ['', '', 'NX', '', '', ''],
45
+ 'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
46
+ 'Parenthetical', ['', '', 'PRN', '', '', ''],
47
+ 'Particle', ['', '', 'PRT', '', '', ''],
48
+ 'Participial phrase', ['', '', '', '', '', 'VPart'],
49
+ 'Quantifier phrase', ['', '', 'QP', '', '', ''],
50
+ 'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
51
+ 'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
52
+ 'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
53
+ 'Verb phrase', ['', '', 'VP', '', '', ''],
54
+ 'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
55
+ 'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
56
+ 'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
57
+ 'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
58
+ 'Unknown', ['', '', 'X', '', '', ''],
59
+ 'Phrase', ['', '', 'P', '', '', 'Sint'],
60
+ 'Sentence', ['', '', 'S', '', '', 'SENT'],
61
+ 'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
53
62
  ]
54
63
 
55
64
  # A description of Enju categories.
@@ -139,12 +148,12 @@ module Treat::Linguistics::Tags
139
148
  # JRS?
140
149
 
141
150
 
142
- SimpleWordTagToCategory = {
151
+ Paris7WordTagToCategory = {
143
152
  'C' => :complementizer,
144
153
  'PN' => :punctuation,
145
154
  'SC' => :conjunction
146
155
  }
147
-
156
+
148
157
  PunctuationToCategory = {
149
158
  '.' => :period,
150
159
  ',' => :comma,
@@ -152,9 +161,8 @@ module Treat::Linguistics::Tags
152
161
  ':' => :colon,
153
162
  '!' => :exclamation,
154
163
  '?' => :interrogation,
155
- '"' => :quote,
156
- "'" => :quote,
157
-
164
+ '"' => :double_quote,
165
+ "'" => :single_quote,
158
166
  '$' => :dollar,
159
167
  '%' => :percent,
160
168
  '#' => :hash,
@@ -227,7 +235,7 @@ module Treat::Linguistics::Tags
227
235
  'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
228
236
  'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
229
237
  'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
230
-
238
+ 'Interjection', ['', '', '', '', '', 'I'],
231
239
  'Localizer', ['', '', '', '', 'LC'],
232
240
 
233
241
  'Measure word', ['', '', '', '', 'M'],
@@ -366,11 +374,25 @@ module Treat::Linguistics::Tags
366
374
  'Verb, ? as main verb', ['', '', '', '', 'VE'], # ?
367
375
  'Verb, ????', ['', '', '', '', 'VC'] # ?
368
376
  ]
369
-
370
- wttc = {
371
-
372
- }
373
- Treat::Linguistics::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
377
+
378
+ # Paris7 Treebank functional tags
379
+ =begin
380
+ SUJ (subject)
381
+ OBJ (direct object)
382
+ ATS (predicative complement of a subject)
383
+ ATO (predicative complement of a direct object)
384
+ MOD (modifier or adjunct)
385
+ A-OBJ (indirect complement introduced by à)
386
+ DE-OBJ (indirect complement introduced by de)
387
+ P-OBJ (indirect complement introduced by another preposition)
388
+ =end
389
+
390
+ # !! Extremely ugly code follows.
391
+
392
+ # Generate word tag -> category hash.
393
+ wttc = {}
394
+
395
+ Treat::Universalisation::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
374
396
 
375
397
  category = desc.gsub(',', ' ,').
376
398
  split(' ')[0].downcase.intern
@@ -378,32 +400,41 @@ module Treat::Linguistics::Tags
378
400
  wttc[tags[ClawsC5]] ||= {}
379
401
  wttc[tags[Brown]] ||= {}
380
402
  wttc[tags[Penn]] ||= {}
381
- wttc[tags[Negra]] ||= {}
403
+ wttc[tags[Stuttgart]] ||= {}
382
404
  wttc[tags[PennChinese]] ||= {}
383
- wttc[tags[Simple]] ||= {}
405
+ wttc[tags[Paris7]] ||= {}
384
406
 
385
407
  wttc[tags[ClawsC5]][:claws_5] = category
386
408
  wttc[tags[Brown]][:brown] = category
387
409
  wttc[tags[Penn]][:penn] = category
388
- wttc[tags[Negra]][:negra] = category if tags[Negra]
410
+ wttc[tags[Stuttgart]][:stuttgart] = category if tags[Stuttgart]
389
411
  wttc[tags[PennChinese]][:penn_chinese] = category if tags[PennChinese]
390
- wttc[tags[Simple]][:simple] = category if tags[Simple]
412
+ wttc[tags[Paris7]][:paris7] = category if tags[Paris7]
391
413
 
392
414
  end
415
+
393
416
  # A hash converting word tags to word categories.
394
417
  WordTagToCategory = wttc
395
418
 
396
419
  # A hash converting phrase tag to categories.
397
420
  pttc = {}
398
- Treat::Linguistics::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
421
+
422
+ Treat::Universalisation::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
423
+
399
424
  category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
425
+
400
426
  pttc[tags[Penn]] ||= {};
427
+ pttc[tags[Paris7]] ||= {};
428
+
429
+ pttc[tags[Penn]][:penn] = category
430
+ pttc[tags[Paris7]][:paris7] = category
431
+
401
432
  # Not yet for other tag sts.
402
433
  #pttc[tags[0]][:claws_5] = category
403
434
  #pttc[tags[1]][:brown] = category
404
- pttc[tags[Penn]][:penn] = category
435
+
405
436
  end
406
-
437
+
407
438
  # A hash converting word tags to word categories.
408
439
  PhraseTagToCategory = pttc
409
440
 
@@ -0,0 +1,9 @@
1
+ module Treat::Universalisation
2
+
3
+ p = 'treat/universalisation/*.rb'
4
+
5
+ Dir[Treat.lib + p].each do |f|
6
+ require f
7
+ end
8
+
9
+ end
data/lib/treat.rb CHANGED
@@ -10,7 +10,7 @@ module Treat
10
10
  end
11
11
 
12
12
  # The current version of Treat.
13
- VERSION = "1.0.4"
13
+ VERSION = "1.0.5"
14
14
 
15
15
  # Add methods to handle syntactic sugar,
16
16
  # language configuration options, and paths.
@@ -44,7 +44,7 @@ module Treat
44
44
  require 'treat/kernel'
45
45
  require 'treat/downloader'
46
46
  require 'treat/languages'
47
- require 'treat/linguistics'
47
+ require 'treat/universalisation'
48
48
  require 'treat/entities'
49
49
  require 'treat/categories'
50
50
  require 'treat/data_set'
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: treat
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.4
4
+ version: 1.0.5
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-04-20 00:00:00.000000000 Z
12
+ date: 2012-05-17 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rubyzip
@@ -161,6 +161,7 @@ files:
161
161
  - lib/treat/languages/german.rb
162
162
  - lib/treat/languages/greek.rb
163
163
  - lib/treat/languages/italian.rb
164
+ - lib/treat/languages/language.rb
164
165
  - lib/treat/languages/list.txt
165
166
  - lib/treat/languages/polish.rb
166
167
  - lib/treat/languages/portuguese.rb
@@ -176,9 +177,6 @@ files:
176
177
  - lib/treat/lexicalizers/taggers/lingua.rb
177
178
  - lib/treat/lexicalizers/taggers/stanford.rb
178
179
  - lib/treat/lexicalizers.rb
179
- - lib/treat/linguistics/categories.rb
180
- - lib/treat/linguistics/tags.rb
181
- - lib/treat/linguistics.rb
182
180
  - lib/treat/loaders/linguistics.rb
183
181
  - lib/treat/loaders/stanford.rb
184
182
  - lib/treat/object.rb
@@ -190,7 +188,6 @@ files:
190
188
  - lib/treat/processors/segmenters/punkt.rb
191
189
  - lib/treat/processors/segmenters/stanford.rb
192
190
  - lib/treat/processors/segmenters/tactful.rb
193
- - lib/treat/processors/tokenizers/perl.rb
194
191
  - lib/treat/processors/tokenizers/ptb.rb
195
192
  - lib/treat/processors/tokenizers/punkt.rb
196
193
  - lib/treat/processors/tokenizers/stanford.rb
@@ -202,6 +199,9 @@ files:
202
199
  - lib/treat/retrievers.rb
203
200
  - lib/treat/server.rb
204
201
  - lib/treat/tree.rb
202
+ - lib/treat/universalisation/encodings.rb
203
+ - lib/treat/universalisation/tags.rb
204
+ - lib/treat/universalisation.rb
205
205
  - lib/treat.rb
206
206
  - spec/collection.rb
207
207
  - spec/document.rb
@@ -1,9 +0,0 @@
1
- module Treat::Linguistics
2
-
3
- p = 'treat/linguistics/*.rb'
4
-
5
- Dir[Treat.lib + p].each do |f|
6
- require f
7
- end
8
-
9
- end
@@ -1,132 +0,0 @@
1
- # encoding: utf-8
2
- #
3
- # Tokenize the entity using a native rule-based
4
- # algorithm. This tokenizer is a port from an
5
- # unknown Perl module, which I have lifted from
6
- # the 'rbtagger' gem.
7
- #
8
- # Author: Todd A. Fisher
9
- #
10
- # This code is free to use under the terms of
11
- # the MIT license.
12
- #
13
- # Original project website:
14
- #
15
- # https://github.com/taf2/rb-brill-tagger
16
- module Treat::Processors::Tokenizers::Perl
17
-
18
- require 'treat/helpers/decimal_point_escaper'
19
-
20
- # Tokenize the entity using a rule-based algorithm
21
- # ported from Perl by Todd A. Fisher.
22
- #
23
- # Options: none.
24
- def self.tokenize(entity, options = {})
25
-
26
- entity.check_hasnt_children
27
- s = entity.to_s
28
-
29
- tokens = get_tokens(entity.to_s)
30
- tokens[1..-1].each do |token|
31
- next if token =~ /^\s*$/
32
- entity << Treat::Entities::Token.
33
- from_string(token)
34
- end
35
-
36
- end
37
-
38
- # Helper method to perform the tokenization.
39
- def self.get_tokens(string)
40
-
41
- # Normalize all whitespace
42
- text = string.gsub(/\s+/,' ')
43
-
44
- # Replace all decimal points by ^^
45
- Treat::Helpers::DecimalPointEscaper.escape!(text)
46
-
47
- =begin
48
-
49
- # Translate some common extended ascii
50
- # characters to quotes
51
- text.gsub!(/‘/,'`')
52
- text.gsub!(/’/,"'")
53
- text.gsub!(/“/,"``")
54
- text.gsub!(/”/,"''")
55
-
56
- # Attempt to get correct directional quotes
57
- # s{\"\b} { `` }g;
58
- text.gsub!(/\"\b/,' `` ')
59
- # s{\b\"} { '' }g;
60
- text.gsub!(/\b\"/," '' ")
61
- #s{\"(?=\s)} { '' }g;
62
- text.gsub!(/\"(?=\s)/," '' ")
63
- #s{\"} { `` }g;
64
- text.gsub!(/\"(?=\s)/," `` ")
65
- =end
66
-
67
- # Isolate ellipses
68
- # s{\.\.\.} { ... }g;
69
- text.gsub!(/\.\.\./,' ... ')
70
- # Isolate any embedded punctuation chars
71
- # s{([,;:\@\#\$\%&])} { $1 }g;
72
- text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
73
-
74
- # Assume sentence tokenization has been
75
- # done first, so split FINAL
76
- # periods only.
77
- # s/ ([^.]) \. ([\]\)\}\>\"\']*)
78
- # [ \t]* $ /$1 .$2 /gx;
79
- text.gsub!(/ ([^.]) \. ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
80
- # however, we may as well split ALL
81
- # question marks and exclamation points,
82
- # since they shouldn't have the abbrev.
83
- # -marker ambiguity problem
84
- #s{([?!])} { $1 }g;
85
- text.gsub!(/([?!])/, ' \1 ')
86
- # parentheses, brackets, etc.
87
- #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
88
- text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
89
- #s/(-{2,})/ $1 /g;
90
- text.gsub!(/(-{2,})/,' \1 ')
91
-
92
- # Add a space to the beginning and end of
93
- # each line, to reduce # of regexps below.
94
- #s/$/ /;
95
- text.gsub!(/$/," ")
96
- #s/^/ /;
97
- text.gsub!(/^/," ")
98
-
99
- # possessive or close-single-quote
100
- #s/\([^\']\)\' /$1 \' /g;
101
- text.gsub!(/\([^\']\)\' /,%q(\1 ' ))
102
- # as in it's, I'm, we'd
103
- #s/\'([smd]) / \'$1 /ig;
104
- text.gsub!(/\'([smd]) /i,%q( '\1 ))
105
- #s/\'(ll|re|ve) / \'$1 /ig;
106
- text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
107
- #s/n\'t / n\'t /ig;
108
- text.gsub!(/n\'t /i," n't ")
109
-
110
- #s/ (can)(not) / $1 $2 /ig;
111
- text.gsub!(/ (can)(not) /i,' \1 \2 ')
112
- #s/ (d\')(ye) / $1 $2 /ig;
113
- text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
114
- #s/ (gim)(me) / $1 $2 /ig;
115
- text.gsub!(/ (gim)(me) /i,' \1 \2 ')
116
- #s/ (gon)(na) / $1 $2 /ig;
117
- text.gsub!(/ (gon)(na) /i,' \1 \2 ')
118
- #s/ (got)(ta) / $1 $2 /ig;
119
- text.gsub!(/ (got)(ta) /i,' \1 \2 ')
120
- #s/ (lem)(me) / $1 $2 /ig;
121
- text.gsub!(/ (lem)(me) /i,' \1 \2 ')
122
- #s/ (more)(\'n) / $1 $2 /ig;
123
- text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
124
- #s/ (\'t)(is|was) / $1 $2 /ig;
125
- text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
126
- #s/ (wan)(na) / $1 $2 /ig;
127
- text.gsub!(/ (wan)(na) /i,' \1 \2 ')
128
- text.split(/\s/)
129
-
130
- end
131
-
132
- end