RubyGems - treat - Versions diffs - 1.0.4 → 1.0.5 - Mend

treat 1.0.4 → 1.0.5

Files changed (44) hide show

data/LICENSE +0 -1
data/files/INFO +1 -1
data/lib/treat/entities/abilities/buildable.rb +2 -6
data/lib/treat/entities/abilities/checkable.rb +2 -2
data/lib/treat/entities/abilities/delegatable.rb +2 -2
data/lib/treat/entities/abilities/doable.rb +6 -1
data/lib/treat/entities/abilities/iterable.rb +8 -0
data/lib/treat/entities/abilities/magical.rb +1 -1
data/lib/treat/extractors.rb +1 -1
data/lib/treat/formatters/visualizers/standoff.rb +1 -1
data/lib/treat/groupable.rb +4 -0
data/lib/treat/installer.rb +33 -13
data/lib/treat/kernel.rb +0 -4
data/lib/treat/languages/arabic.rb +1 -1
data/lib/treat/languages/chinese.rb +1 -1
data/lib/treat/languages/dutch.rb +1 -1
data/lib/treat/languages/english.rb +1 -1
data/lib/treat/languages/french.rb +4 -4
data/lib/treat/languages/german.rb +3 -3
data/lib/treat/languages/italian.rb +1 -1
data/lib/treat/{linguistics/categories.rb → languages/language.rb} +3 -4
data/lib/treat/languages/polish.rb +1 -1
data/lib/treat/languages/portuguese.rb +1 -1
data/lib/treat/languages/russian.rb +1 -1
data/lib/treat/languages/spanish.rb +1 -1
data/lib/treat/languages/swedish.rb +1 -1
data/lib/treat/lexicalizers/categorizers/from_tag.rb +14 -10
data/lib/treat/lexicalizers/taggers/brill.rb +1 -1
data/lib/treat/lexicalizers/taggers/stanford.rb +5 -2
data/lib/treat/lexicalizers.rb +2 -1
data/lib/treat/processors/parsers/enju.rb +2 -2
data/lib/treat/processors/parsers/stanford.rb +17 -11
data/lib/treat/processors/segmenters/punkt.rb +5 -2
data/lib/treat/processors/segmenters/tactful.rb +5 -1
data/lib/treat/processors/tokenizers/ptb.rb +11 -3
data/lib/treat/processors/tokenizers/punkt.rb +0 -3
data/lib/treat/processors/tokenizers/tactful.rb +3 -0
data/lib/treat/universalisation/encodings.rb +12 -0
data/lib/treat/{linguistics → universalisation}/tags.rb +77 -46
data/lib/treat/universalisation.rb +9 -0
data/lib/treat.rb +2 -2
metadata +6 -6
data/lib/treat/linguistics.rb +0 -9
data/lib/treat/processors/tokenizers/perl.rb +0 -132

data/LICENSE CHANGED Viewed

@@ -20,7 +20,6 @@ Non-trivial amount of code has been incorporated and modified from other librari
 - formatters/readers/odt.rb - Mark Watson (GPL license)
 - processors/tokenizers/macintyre.rb - Utiyama Masao (Ruby License)
 - processors/tokenizers/tactful.rb - Matthew Bunday (GPL license)
-- processors/tokenizers/perl.rb - Todd A. Fisher (MIT License)
 - processors/tokenizers/punkt.rb - Steven Bird Edward Loper and Joel Nothman (Apache 2.0 license)
 - extractors/topics/reuters.rb - Mark Watson (GPL license)
 - inflectors/declensions/english.rb - Thomas Sawyer (MIT license)

data/files/INFO CHANGED Viewed

	@@ -1 +1 @@
1	- This is a folder containing the files downloaded by Treat.
1	+ This is a folder containing the files downloaded by Treat from the internet.

data/lib/treat/entities/abilities/buildable.rb CHANGED Viewed

@@ -4,12 +4,11 @@
 # is pretty much self-explanatory.
 module Treat::Entities::Abilities::Buildable
-  require 'treat/helpers/decimal_point_escaper'
   require 'fileutils'
   # Simple regexps to match common entities.
   WordRegexp = /^[[:alpha:]\-']+$/
-  NumberRegexp = /^#?([0-9]+)(\^\^[0-9]+)?$/
+  NumberRegexp = /^#?([0-9]+)(\.[0-9]+)?$/
   PunctRegexp = /^[[:punct:]\$]+$/
   UriRegexp = /^(http|https):\/\/[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(([0-9]{1,5})?\/.*)?$/ix
   EmailRegexp = /.+\@.+\..+/
@@ -57,8 +56,6 @@ module Treat::Entities::Abilities::Buildable
   # instead of from_string directly).
   def from_string(string, enforce_type = false)
-    Treat::Helpers::DecimalPointEscaper.escape!(string)
     enforce_type = true if caller_method == :build
     unless self == Treat::Entities::Entity
@@ -74,6 +71,7 @@ module Treat::Entities::Abilities::Buildable
     end
     e
   end
   # Build a document from an URL.
@@ -116,7 +114,6 @@ module Treat::Entities::Abilities::Buildable
       "a numeric object."
     end
     n = numeric.to_s
-    Treat::Helpers::DecimalPointEscaper.unescape!(n)
     Treat::Entities::Number.new(n)
   end
@@ -319,7 +316,6 @@ module Treat::Entities::Abilities::Buildable
   end
   def create_collection(fv)
-    debug("Creating new collection in directory #{fv}.")
     FileUtils.mkdir(fv)
     Treat::Entities::Collection.new(fv)
   end

data/lib/treat/entities/abilities/checkable.rb CHANGED Viewed

@@ -24,8 +24,8 @@ module Treat::Entities::Abilities::Checkable
     return unless has_children?
     raise Treat::Exception,
     "Warning: can't #{caller_method(2)} "+
-    "an entity that has children. Removing " +
-    " all children of text \"[#{short_value}].\""
+    "the text \"#{short_value}\", because it " +
+    "already has children."
   end
 end

data/lib/treat/entities/abilities/delegatable.rb CHANGED Viewed

@@ -104,9 +104,9 @@ module Treat::Entities::Abilities::Delegatable
     if !klass[g] || !klass[g][0]
       d = ucc(cl(group))
       d.gsub!('_', ' ')
-      d = 'worker to find "' + d
+      d = d[0..-2]
       raise Treat::Exception, "No #{d}" +
-      "\" is available for the " +
+      " is available for the " +
       "#{lang.to_s.capitalize} language."
     end
     return klass[g][0]

data/lib/treat/entities/abilities/doable.rb CHANGED Viewed

@@ -37,8 +37,13 @@ module Treat::Entities::Abilities::Doable
     end
     if f || entity_types.include?(:entity)
       send(task, worker, options)
+      if group.recursive
+        each do |entity|
+          entity.do_task(task, worker, options, group)
+        end
+      end
     else
-      each_entity(*entity_types) do |entity|
+      each do |entity|
         entity.do_task(task, worker, options, group)
       end
       unless entity_types.include?(type)

data/lib/treat/entities/abilities/iterable.rb CHANGED Viewed

@@ -95,6 +95,14 @@ module Treat::Entities::Abilities::Iterable
     as
   end
+  # Returns the first ancestor that has a feature
+  # with the given name, otherwise nil.
+  def ancestor_with_feature(type, feature)
+    each_ancestor(type) do |ancestor|
+      return ancestor if ancestor.has?(feature)
+    end
+  end
   alias :ancestors_with_type :ancestors_with_types
   # Number of children that have a given feature.

data/lib/treat/entities/abilities/magical.rb CHANGED Viewed

@@ -25,7 +25,7 @@ module Treat::Entities::Abilities::Magical
   def magic(sym, *args)
     @@entities_regexp ||= "(#{Treat::Entities.list.join('|')})"
-    @@cats_regexp ||= "(#{Treat::Linguistics::WordCategories.join('|')})"
+    @@cats_regexp ||= "(#{Treat::Languages::Language::WordCategories.join('|')})"
     method = sym.to_s =~ /entities/ ?
     sym.to_s.gsub('entities', 'entitys') :

data/lib/treat/extractors.rb CHANGED Viewed

@@ -27,7 +27,7 @@ module Treat::Extractors
   module Keywords
     extend Treat::Groupable
     self.type = :annotator
-    self.targets = [:document]
+    self.targets = [:document, :section, :zone]
   end
   # Extract clusters of topic words from a collection.

data/lib/treat/formatters/visualizers/standoff.rb CHANGED Viewed

@@ -44,7 +44,7 @@ class Treat::Formatters::Visualizers::Standoff
   end
   def self.ptb_escape(val)
-    Treat::Linguistics::Tags::
+    Treat::Universalisation::Tags::
     PTBEscapeCharacters.each do |char, esc|
       val.gsub!(char, val)
     end

data/lib/treat/groupable.rb CHANGED Viewed

@@ -95,8 +95,12 @@ module Treat::Groupable
         attr_accessor :presets
         # The preset option to use with preset functions.
         attr_accessor :preset_option
+        # Whether to recurse within multiple targets or not.
+        attr_accessor :recursive
       end
+      self.recursive = false
       # Return the method corresponding to the group.
       # This method resolves the name of the method
       # that a group should provide based on the name

data/lib/treat/installer.rb CHANGED Viewed

@@ -82,9 +82,13 @@ module Treat::Installer
       begin
         Gem::Specification.find_by_name('punkt-segmenter')
         title "Downloading model for the Punkt segmenter for the #{l}."
-        download_punkt_models(language)
+        # Need fix
+        download_punkt_models([language.to_s])
       rescue Gem::LoadError; end
+      # Download reuters models always
+      download_reuters_models
       # If stanford is installed, download models.
       begin
         Gem::Specification.find_by_name('stanford-core-nlp')
@@ -92,7 +96,10 @@ module Treat::Installer
         "model files for the the #{l}.\n\n"
         package = (language == :english) ? :english : :all
         download_stanford(package)
-      rescue Gem::LoadError; end
+      rescue Gem::LoadError
+        puts 'Stanford-core-nlp gem not installed.'
+        puts 'Skipping download of Stanford models.'
+      end
       title "Install external binary libraries " +
             "(requires port, apt-get or win-get).\n"
@@ -124,7 +131,7 @@ module Treat::Installer
     install_dependencies(false)
     install_language_dependencies(dep, false)
     download_stanford(:minimal)
-    download_punkt_models(:english)
+    download_punkt_models([:english])
   end
   def self.install_dependencies(optionally)
@@ -166,7 +173,7 @@ module Treat::Installer
     unless man
       puts "Skipping installation of the "+
       "following binaries:\n\n"
-      Binaries.each do |binary, purpose|
+      Binary.each do |binary, purpose|
         puts "- #{binary} to #{purpose}"
       end
       return
@@ -227,22 +234,35 @@ module Treat::Installer
   end
-  def self.download_punkt_models(language)
+  def self.download_punkt_models(languages)
+    languages.map! { |l| "#{l}.yaml" }
+    download_models 'punkt', languages
+  end
+  def self.download_reuters_models
+    files = ["industry.xml", "region.xml", "topics.xml"]
+    download_models 'reuters', files
+  end
-    f = "#{language}.yaml"
-    dest = "#{Treat.models}punkt/"
+  def self.download_models(directory, files)
-    loc = Treat::Downloader.download(
-    'http', Server, 'treat/punkt', f, Treat.tmp)
+    dest = "#{Treat.models}#{directory}/"
     unless File.readable?(dest)
-      puts "- Creating directory models/punkt ..."
+      puts "- Creating directory models/#{directory} ..."
       FileUtils.mkdir_p(File.absolute_path(dest))
     end
-    puts "- Copying model file to models/punkt ..."
-    FileUtils.cp(loc, File.join(Paths[:models], 'punkt', f))
+    files.each do |file|
+      puts "- Downloading #{file} ..."
+      loc = Treat::Downloader.download(
+      'http', Server, "treat/#{directory}", file, Treat.tmp)
+      puts "- Copying file to models/#{directory} ..."
+      FileUtils.cp(loc, File.join(Paths[:models], directory, file))
+    end
     puts "- Cleaning up..."
     FileUtils.rm_rf(Paths[:tmp] + Server)

data/lib/treat/kernel.rb CHANGED Viewed

@@ -181,10 +181,6 @@ module Kernel
     NULL_DEVICE = '/dev/null'
   end
-  def debug(msg)
-    puts msg if Treat.debug
-  end
   def prompt(msg, valid_answers)
     msg = msg

data/lib/treat/languages/arabic.rb CHANGED Viewed

@@ -6,7 +6,7 @@ class Treat::Languages::Arabic
   Extractors = {}
   Inflectors = {}
   Lexicalizers = {
-    :tag => [:stanford]
+    :taggers => [:stanford]
   }
   Processors = {
     :parsers => [:stanford]

data/lib/treat/languages/chinese.rb CHANGED Viewed

@@ -6,7 +6,7 @@ class Treat::Languages::Chinese
   Extractors = {}
   Inflectors = {}
   Lexicalizers = {
-    :tag => [:stanford]
+    :taggers => [:stanford]
   }
   Processors = {
     :parsers => [:stanford]

data/lib/treat/languages/dutch.rb CHANGED Viewed

@@ -9,7 +9,7 @@ class Treat::Languages::Dutch
   Processors = {
     :chunkers => [:txt],
     :segmenters => [:punkt],
-    :tokenizers => [:perl, :tactful]
+    :tokenizers => [:tactful]
   }
   Retrievers = {}

data/lib/treat/languages/english.rb CHANGED Viewed

@@ -31,7 +31,7 @@ class Treat::Languages::English
     :chunkers => [:txt],
     :parsers => [:stanford, :enju],
     :segmenters => [:tactful, :punkt, :stanford],
-    :tokenizers => [:perl, :ptb, :stanford, :tactful, :punkt]
+    :tokenizers => [:ptb, :stanford, :tactful, :punkt]
   }
   Retrievers = {

data/lib/treat/languages/french.rb CHANGED Viewed

@@ -6,14 +6,14 @@ class Treat::Languages::French
   Extractors = {}
   Inflectors = {}
   Lexicalizers = {
-    :tag => [:stanford],
-    :category => [:from_tag]
+    :taggers => [:stanford],
+    :categorizers => [:from_tag]
   }
   Processors = {
     :chunkers => [:txt],
     :parsers => [:stanford],
-    :segmenters => [:punkt],
-    :tokenizers => [:perl, :tactful]
+    :segmenters => [:tactful],
+    :tokenizers => [:tactful]
   }
   Retrievers = {}

data/lib/treat/languages/german.rb CHANGED Viewed

@@ -6,14 +6,14 @@ class Treat::Languages::German
   Extractors = {}
   Inflectors = {}
   Lexicalizers = {
-    :tag => [:stanford],
-    :category => [:from_tag]
+    :taggers => [:stanford],
+    :categorizers => [:from_tag]
   }
   Processors = {
     :chunkers => [:txt],
     :parsers => [:stanford],
     :segmenters => [:punkt],
-    :tokenizers => [:perl, :tactful]
+    :tokenizers => [:tactful]
   }
   Retrievers = {}

data/lib/treat/languages/italian.rb CHANGED Viewed

@@ -10,7 +10,7 @@ class Treat::Languages::Italian
     :chunkers => [:txt],
     :parsers => [:stanford],
     :segmenters => [:punkt],
-    :tokenizers => [:perl, :tactful]
+    :tokenizers => [:tactful]
   }
   Retrievers = {}

data/lib/treat/{linguistics/categories.rb → languages/language.rb} RENAMED Viewed

@@ -1,11 +1,10 @@
-module Treat::Linguistics
-  # A list of all possible word categories.
+class Treat::Languages::Language
   WordCategories = [
     :adjective, :adverb, :noun, :verb, :interjection,
     :clitic, :coverb, :conjunction, :determiner, :particle,
     :preposition, :pronoun, :number, :symbol, :punctuation,
     :complementizer
   ]
 end

data/lib/treat/languages/polish.rb CHANGED Viewed

@@ -9,7 +9,7 @@ class Treat::Languages::Polish
   Processors = {
     :chunkers => [:txt],
     :segmenters => [:punkt],
-    :tokenizers => [:perl, :tactful]
+    :tokenizers => [:tactful]
   }
   Retrievers = {}

data/lib/treat/languages/portuguese.rb CHANGED Viewed

@@ -9,7 +9,7 @@ class Treat::Languages::Portuguese
   Processors = {
     :chunkers => [:txt],
     :segmenters => [:punkt],
-    :tokenizers => [:perl, :tactful]
+    :tokenizers => [:tactful]
   }
   Retrievers = {}

data/lib/treat/languages/russian.rb CHANGED Viewed

@@ -9,7 +9,7 @@ class Treat::Languages::Russian
   Processors = {
     :chunkers => [:txt],
     :segmenters => [:punkt],
-    :tokenizers => [:perl, :tactful]
+    :tokenizers => [:tactful]
   }
   Retrievers = {}

data/lib/treat/languages/spanish.rb CHANGED Viewed

@@ -9,7 +9,7 @@ class Treat::Languages::Spanish
   Processors = {
     :chunkers => [:txt],
     :segmenters => [:punkt],
-    :tokenizers => [:perl, :tactful]
+    :tokenizers => [:tactful]
   }
   Retrievers = {}

data/lib/treat/languages/swedish.rb CHANGED Viewed

@@ -9,7 +9,7 @@ class Treat::Languages::Swedish
   Processors = {
     :chunkers => [:txt],
     :segmenters => [:punkt],
-    :tokenizers => [:perl, :tactful]
+    :tokenizers => [:tactful]
   }
   Retrievers = {}

data/lib/treat/lexicalizers/categorizers/from_tag.rb CHANGED Viewed

@@ -3,17 +3,19 @@
 # from its tag (e.g. 'S', 'NP', 'VBZ', 'ADV', etc.).
 class Treat::Lexicalizers::Categorizers::FromTag
-  Pttc = Treat::Linguistics::Tags::PhraseTagToCategory
-  Wttc = Treat::Linguistics::Tags::WordTagToCategory
-  Ptc = Treat::Linguistics::Tags::PunctuationToCategory
+  Pttc = Treat::Universalisation::Tags::PhraseTagToCategory
+  Wttc = Treat::Universalisation::Tags::WordTagToCategory
+  Ptc = Treat::Universalisation::Tags::PunctuationToCategory
   # Find the category of the entity from its tag.
   def self.category(entity, options = {})
     tag = entity.check_has(:tag)
     return :unknown if tag.nil? || tag == '' || entity.type == :symbol
     return :sentence if tag == 'S' || entity.type == :sentence
     return :number if entity.type == :number
     return Ptc[entity.to_s] if entity.type == :punctuation
     if entity.is_a?(Treat::Entities::Phrase)
@@ -29,15 +31,17 @@ class Treat::Lexicalizers::Categorizers::FromTag
     if entity.has?(:tag_set)
       ts = entity.get(:tag_set)
-    elsif entity.parent_phrase &&
-      entity.parent_phrase.has?(:tag_set)
-      ts = entity.parent_phrase.get(:tag_set)
     else
-      raise Treat::Exception,
-      "No information can be found regarding "+
-      "which tag set to use."
+      a = entity.ancestor_with_feature(:phrase, :tag_set)
+      if a
+        ts = a.get(:tag_set)
+      else
+        raise Treat::Exception,
+        "No information can be found regarding "+
+        "which tag set to use."
+      end
     end
     if cat[ts]
       return cat[ts]
     else

data/lib/treat/lexicalizers/taggers/brill.rb CHANGED Viewed

@@ -35,7 +35,7 @@ module Treat::Lexicalizers::Taggers::Brill
     # Tokenize the sentence/phrase.
     if !entity.has_children? &&
       !entity.is_a?(Treat::Entities::Token)
-      entity.tokenize(:perl, options)
+      entity.tokenize(options)
     end
     # Create the tagger if necessary

data/lib/treat/lexicalizers/taggers/stanford.rb CHANGED Viewed

@@ -38,11 +38,14 @@ class Treat::Lexicalizers::Taggers::Stanford
     end
     # Handle tags for sentences and phrases.
     if entity.is_a?(Treat::Entities::Sentence) ||
       (entity.is_a?(Treat::Entities::Phrase) &&
       !entity.parent_sentence)
-        entity.set :tag_set, :penn
+        tag_set =  Treat::Universalisation::Tags::
+                    StanfordTagSetForLanguage[
+                   Treat::Languages.describe(lang)]
+        entity.set :tag_set, tag_set
     end
     if entity.is_a?(Treat::Entities::Sentence)

data/lib/treat/lexicalizers.rb CHANGED Viewed

@@ -16,7 +16,8 @@ module Treat::Lexicalizers
   module Categorizers
     extend Treat::Groupable
     self.type = :annotator
-    self.targets = [:token]
+    self.targets = [:sentence, :phrase, :token]
+    self.recursive = true
     self.default = :from_tag
   end

data/lib/treat/processors/parsers/enju.rb CHANGED Viewed

@@ -23,10 +23,10 @@ module Treat::Processors::Parsers::Enju
   @@parser = nil
   # A hash of Enju cat tags mapped to word categories.
-  Ectc = Treat::Linguistics::Tags::EnjuCatToCategory
+  Ectc = Treat::Universalisation::Tags::EnjuCatToCategory
   # A hash of Enju cat/xcat pairs mapped to PTB tags.
-  Ecxtp = Treat::Linguistics::Tags::EnjuCatXcatToPTB
+  Ecxtp = Treat::Universalisation::Tags::EnjuCatXcatToPTB
   # Parse the entity into its syntactical
   # phrases using Enju.

data/lib/treat/processors/parsers/stanford.rb CHANGED Viewed

@@ -27,6 +27,10 @@ class Treat::Processors::Parsers::Stanford
     lang = entity.language
     init(lang, options)
+    tag_set =  Treat::Universalisation::Tags::
+               StanfordTagSetForLanguage[
+               Treat::Languages.describe(lang)]
     text = ::StanfordCoreNLP::Text.new(val)
     @@parsers[lang].annotate(text)
@@ -37,17 +41,18 @@ class Treat::Processors::Parsers::Stanford
         tag = s.get(:category).to_s
         tag_s, tag_opt = *tag.split('-')
         tag_s ||= 'S'
-        entity.set :tag_set, :penn
         entity.set :tag, tag_s
         entity.set :tag_opt, tag_opt if tag_opt
-        recurse(s.get(:tree).children[0], entity)
-        break
+        recurse(s.get(:tree).children[0], entity, tag_set)
+        break #######
       else
         recurse(s.get(:tree), entity)
       end
     end
+    entity.set :tag_set, tag_set
   end
   def self.init(lang, options)
@@ -76,7 +81,7 @@ class Treat::Processors::Parsers::Stanford
   # Helper method which recurses the tree supplied by
   # the Stanford parser.
-  def self.recurse(java_node, ruby_node, additional_tags = [])
+  def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
     if java_node.num_children == 0
@@ -85,10 +90,8 @@ class Treat::Processors::Parsers::Stanford
       tag_s, tag_opt = *tag.split('-')
       tag_s ||= ''
       ruby_node.value = java_node.value.to_s.strip
-      ruby_node.set :tag_set, :penn
       ruby_node.set :tag, tag_s
       ruby_node.set :tag_opt, tag_opt if tag_opt
-      ruby_node.set :tag_set, :penn
       ruby_node.set :lemma, label.get(:lemma).to_s
       additional_tags.each do |t|
@@ -103,33 +106,35 @@ class Treat::Processors::Parsers::Stanford
       if java_node.num_children == 1 &&
         java_node.children[0].num_children == 0
         recurse(java_node.children[0],
-        ruby_node, additional_tags)
+        ruby_node, tag_set, additional_tags)
         return
       end
       java_node.children.each do |java_child|
         label = java_child.label
         tag = label.get(:category).to_s
         tag_s, tag_opt = *tag.split('-')
         tag_s ||= ''
-        if Treat::Linguistics::Tags::PhraseTagToCategory[tag_s]
+        if Treat::Universalisation::Tags::PhraseTagToCategory[tag_s] &&
+           Treat::Universalisation::Tags::PhraseTagToCategory[tag_s][tag_set]
           ruby_child = Treat::Entities::Phrase.new
         else
           l = java_child.children[0].to_s
           v = java_child.children[0].value.to_s.strip
           # Mhmhmhmhmhm
           val = (l == v) ? v :  l.split(' ')[-1].gsub(')', '')
           ruby_child = Treat::Entities::Token.from_string(val)
         end
-        ruby_child.set :tag_set, :penn
         ruby_child.set :tag, tag_s
         ruby_child.set :tag_opt, tag_opt if tag_opt
         ruby_node << ruby_child
         unless java_child.children.empty?
-          recurse(java_child, ruby_child, additional_tags)
+          recurse(java_child, ruby_child, tag_set, additional_tags)
         end
       end
@@ -137,4 +142,5 @@ class Treat::Processors::Parsers::Stanford
     end
   end
 end

data/lib/treat/processors/segmenters/punkt.rb CHANGED Viewed

@@ -38,15 +38,18 @@ module Treat::Processors::Segmenters::Punkt
     s = entity.to_s
-    # Replace all decimal points by ^^
+    # Replace the point in all floating-point numbers
+    # by ^^; this is a fix since Punkt trips on decimal
+    # numbers.
     Treat::Helpers::DecimalPointEscaper.escape!(s)
-    s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
+    s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
     result = @@segmenters[lang].
     sentences_from_text(s,
     :output => :sentences_text)
     result.each do |sentence|
+      # Unescape the sentence.
       Treat::Helpers::DecimalPointEscaper.
       unescape!(sentence)
       entity << Treat::Entities::Phrase.

data/lib/treat/processors/segmenters/tactful.rb CHANGED Viewed

@@ -29,17 +29,21 @@ module Treat::Processors::Segmenters::Tactful
     entity.check_hasnt_children
     s = entity.to_s
     Treat::Helpers::DecimalPointEscaper.escape!(s)
-    s.gsub!(/([^\.\?!]\.|\!|\?)([^\s])/) { $1 + ' ' + $2 }
+    s.gsub!(/([^\.\?!]\.|\!|\?)([^\s"'])/) { $1 + ' ' + $2 }
     @@segmenter ||= TactfulTokenizer::Model.new
     sentences = @@segmenter.tokenize_text(s)
     sentences.each do |sentence|
       Treat::Helpers::DecimalPointEscaper.unescape!(sentence)
+      puts sentence.to_s if sentence.to_s.include?('staff')
       entity << Treat::Entities::Phrase.from_string(sentence)
     end
   end
 end

data/lib/treat/processors/tokenizers/ptb.rb CHANGED Viewed

@@ -1,3 +1,4 @@
+# encoding: utf-8
 # A native rule-basd tokenizer based on the one
 # developped by Robert Macyntyre in 1995 for the Penn
 # Treebank project. This tokenizer follows the
@@ -11,8 +12,6 @@
 # you can redistribute it and/or modify it under the
 # same terms as Ruby itself.
 module Treat::Processors::Tokenizers::PTB
-  require 'treat/helpers/decimal_point_escaper'
   # Tokenize the entity using a native rule-based algorithm.
   def self.tokenize(entity, options = {})
@@ -33,8 +32,17 @@ module Treat::Processors::Tokenizers::PTB
   # Helper method to split the string into tokens.
   def self.split(string)
     s = " " + string + " "
-    Treat::Helpers::DecimalPointEscaper.escape!(s)
+    # Translate some common extended ascii
+    # characters to quotes
+    s.gsub!(/‘/,'`')
+    s.gsub!(/’/,"'")
+    s.gsub!(/“/,"``")
+    s.gsub!(/”/,"''")
     s.gsub!(/\s+/," ")
     s.gsub!(/(\s+)''/,'\1"')
     s.gsub!(/(\s+)``/,'\1"')

data/lib/treat/processors/tokenizers/punkt.rb CHANGED Viewed

@@ -14,8 +14,6 @@
 # Project website: https://github.com/lfcipriani/punkt-segmenter
 class Treat::Processors::Tokenizers::Punkt
-  require 'treat/helpers/decimal_point_escaper'
   SentEndChars = ['.', '?', '!']
   ReSentEndChars = /[.?!]/
   InternalPunctuation = [',', ':', ';']
@@ -35,7 +33,6 @@ class Treat::Processors::Tokenizers::Punkt
     entity.check_hasnt_children
     s = entity.to_s
-    Treat::Helpers::DecimalPointEscaper.escape!(s)
     s.scan(ReWordTokenizer).each do |token|
       if SentEndChars.include?(token[-1])

data/lib/treat/processors/tokenizers/tactful.rb CHANGED Viewed

@@ -51,6 +51,7 @@ class Treat::Processors::Tokenizers::Tactful
     entity.check_hasnt_children
     s = entity.to_s
     Treat::Helpers::DecimalPointEscaper.escape!(s)
     ReTokenize.each do |rules|
@@ -58,6 +59,8 @@ class Treat::Processors::Tokenizers::Tactful
     end
     s.split(' ').each do |token|
+      Treat::Helpers::DecimalPointEscaper.unescape!(token)
       entity << Treat::Entities::Token.
       from_string(token)
     end

data/lib/treat/universalisation/encodings.rb ADDED Viewed

@@ -0,0 +1,12 @@
+module Treat::Universalisation
+  Encodings = {
+    :arabic => 'UTF-8',
+    :chinese => 'GB18030',
+    :english => 'UTF-8',
+    :french => 'ISO_8859-1',
+    :german => 'ISO_8859-1',
+    :hebrew => 'UTF-8'
+  }
+end

data/lib/treat/{linguistics → universalisation}/tags.rb RENAMED Viewed

@@ -1,14 +1,20 @@
-module Treat::Linguistics::Tags
+module Treat::Universalisation::Tags
   ClawsC5 = 0
   Brown = 1
   Penn = 2
-  Negra = 3
+  Stuttgart = 3
   PennChinese = 4
-  Simple = 5
+  Paris7 = 5
+  StanfordTagSetForLanguage = {
+    :french => :paris7,
+    :english => :penn,
+    :german => :stuttgart
+  }
   PTBClauseTagDescription = [
-    ['S', 'Simple declarative clause'],
+    ['S', 'Paris7 declarative clause'],
     ['SBAR', 'Clause introduced by a (possibly empty) subordinating conjunction'],
     ['SBARQ', 'Direct question introduced by a wh-word or a wh-phrase'],
     ['SINV', 'Inverted declarative sentence'],
@@ -26,30 +32,33 @@ module Treat::Linguistics::Tags
   AlignedPhraseTags =
   [
-    'Adjective phrase', ['', '', 'ADJP'],
-    'Adverb phrase', ['', '', 'ADVP'],
-    'Conjunction phrase', ['', '', 'CONJP'],
-    'Fragment', ['', '', 'FRAG'],
-    'Interjection', ['', '', 'INTJ'],
-    'List marker', ['', '', 'LST'],
-    'Not a phrase', ['', '', 'NAC'],
-    'Noun phrase', ['', '', 'NP'],
-    'Head of NP', ['', '', 'NX'],
-    'Prepositional phrase', ['', '', 'PP'],
-    'Parenthetical', ['', '', 'PRN'],
-    'Particle', ['', '', 'PRT'],
-    'Quantifier phrase', ['', '', 'QP'],
-    'Reduced relative clause', ['', '', 'RRC'],
-    'Unlike coordinated phrase', ['', '', 'UCP'],
-    'Verb phrase', ['', '', 'VP'],
-    'Wh adjective phrase', ['', '', 'WHADJP'],
-    'Wh adverb phrase', ['', '', 'WHAVP'],
-    'Wh noun phrase', ['', '', 'WHNP'],
-    'Wh prepositional phrase', ['', '', 'WHPP'],
-    'Unknown', ['', '', 'X'],
-    'Phrase', ['', '', 'P'],
-    'Sentence', ['', '', 'S'],
-    'Phrase', ['', '', 'SBAR'] # Fix
+    'Adjectival phrase', ['', '', 'ADJP', '', '', 'AP'],
+    'Adverbial phrase', ['', '', 'ADVP', '', '', 'AdP'],
+    'Conjunction phrase', ['', '', 'CONJP', '', '', 'Ssub'],
+    'Fragment', ['', '', 'FRAG', '', '', ''],
+    'Interjectional phrase', ['', '', 'INTJ', '', '', ''],
+    'List marker', ['', '', 'LST', '', '', ''],
+    'Not a phrase', ['', '', 'NAC', '', '', ''],
+    'Noun phrase', ['', '', 'NP', '', '', 'NP'],
+    'Verbal nucleus',  ['', '', '', '', '', 'VN'],
+    'Head of noun phrase', ['', '', 'NX', '', '', ''],
+    'Prepositional phrase', ['', '', 'PP', '', '', 'PP'],
+    'Parenthetical', ['', '', 'PRN', '', '', ''],
+    'Particle', ['', '', 'PRT', '', '', ''],
+    'Participial phrase', ['', '', '', '', '', 'VPart'],
+    'Quantifier phrase', ['', '', 'QP', '', '', ''],
+    'Relative clause', ['', '', 'RRC', '', '', 'Srel'],
+    'Coordinated phrase', ['', '', 'UCP', '', '', 'COORD'],
+    'Infinitival phrase', ['', '', '', '', '', 'VPinf'],
+    'Verb phrase', ['', '', 'VP', '', '', ''],
+    'Wh adjective phrase', ['', '', 'WHADJP', '', '', ''],
+    'Wh adverb phrase', ['', '', 'WHAVP', '', '', ''],
+    'Wh noun phrase', ['', '', 'WHNP', '', '', ''],
+    'Wh prepositional phrase', ['', '', 'WHPP', '', '', ''],
+    'Unknown', ['', '', 'X', '', '', ''],
+    'Phrase', ['', '', 'P', '', '', 'Sint'],
+    'Sentence', ['', '', 'S', '', '', 'SENT'],
+    'Phrase', ['', '', 'SBAR', '', '', ''] # Fix
   ]
   # A description of Enju categories.
@@ -139,12 +148,12 @@ module Treat::Linguistics::Tags
   # JRS?
-  SimpleWordTagToCategory = {
+  Paris7WordTagToCategory = {
     'C' => :complementizer,
     'PN' => :punctuation,
     'SC' => :conjunction
   }
   PunctuationToCategory = {
     '.' => :period,
     ',' => :comma,
@@ -152,9 +161,8 @@ module Treat::Linguistics::Tags
     ':' => :colon,
     '!' => :exclamation,
     '?' => :interrogation,
-    '"' => :quote,
-    "'" => :quote,
+    '"' => :double_quote,
+    "'" => :single_quote,
     '$' => :dollar,
     '%' => :percent,
     '#' => :hash,
@@ -227,7 +235,7 @@ module Treat::Linguistics::Tags
     'Determiner, possessive, second', ['DPS', 'PP$', 'PRPS', '', '', 'D'],
     'Determiner, question', ['DTQ', 'WDT', 'WDT', '', 'DT', 'D'],
     'Determiner, possessive & question', ['DTQ', 'WP$', 'WP$', '', '', 'D'],
+    'Interjection', ['', '', '', '', '', 'I'],
     'Localizer', ['', '', '', '', 'LC'],
     'Measure word', ['', '', '', '', 'M'],
@@ -366,11 +374,25 @@ module Treat::Linguistics::Tags
     'Verb, ? as main verb', ['', '', '', '', 'VE'],                  # ?
     'Verb, ????', ['', '', '', '', 'VC']                             # ?
   ]
-  wttc = {
-  }
-  Treat::Linguistics::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
+  # Paris7 Treebank functional tags
+=begin
+SUJ (subject)
+OBJ (direct object)
+ATS (predicative complement of a subject)
+ATO (predicative complement of a direct object)
+MOD (modifier or adjunct)
+A-OBJ (indirect complement introduced by à)
+DE-OBJ (indirect complement introduced by de)
+P-OBJ (indirect complement introduced by another preposition)
+=end
+  # !! Extremely ugly code follows.
+  # Generate word tag -> category hash.
+  wttc = {}
+  Treat::Universalisation::Tags::AlignedWordTags.each_slice(2) do |desc, tags|
     category = desc.gsub(',', ' ,').
     split(' ')[0].downcase.intern
@@ -378,32 +400,41 @@ module Treat::Linguistics::Tags
     wttc[tags[ClawsC5]] ||= {}
     wttc[tags[Brown]] ||= {}
     wttc[tags[Penn]] ||= {}
-    wttc[tags[Negra]] ||= {}
+    wttc[tags[Stuttgart]] ||= {}
     wttc[tags[PennChinese]] ||= {}
-    wttc[tags[Simple]] ||= {}
+    wttc[tags[Paris7]] ||= {}
     wttc[tags[ClawsC5]][:claws_5] = category
     wttc[tags[Brown]][:brown] = category
     wttc[tags[Penn]][:penn] = category
-    wttc[tags[Negra]][:negra] = category if tags[Negra]
+    wttc[tags[Stuttgart]][:stuttgart] = category if tags[Stuttgart]
     wttc[tags[PennChinese]][:penn_chinese] = category if tags[PennChinese]
-    wttc[tags[Simple]][:simple] = category if tags[Simple]
+    wttc[tags[Paris7]][:paris7] = category if tags[Paris7]
   end
   # A hash converting word tags to word categories.
   WordTagToCategory = wttc
   # A hash converting phrase tag to categories.
   pttc = {}
-  Treat::Linguistics::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
+  Treat::Universalisation::Tags::AlignedPhraseTags.each_slice(2) do |desc, tags|
     category = desc.gsub(',', ' ,').gsub(' ', '_').downcase.intern
     pttc[tags[Penn]] ||= {};
+    pttc[tags[Paris7]] ||= {};
+    pttc[tags[Penn]][:penn] = category
+    pttc[tags[Paris7]][:paris7] = category
     # Not yet for other tag sts.
     #pttc[tags[0]][:claws_5] = category
     #pttc[tags[1]][:brown] = category
-    pttc[tags[Penn]][:penn] = category
   end
   # A hash converting word tags to word categories.
   PhraseTagToCategory = pttc

data/lib/treat/universalisation.rb ADDED Viewed

@@ -0,0 +1,9 @@
+module Treat::Universalisation
+  p = 'treat/universalisation/*.rb'
+  Dir[Treat.lib + p].each do |f|
+    require f
+  end
+end

data/lib/treat.rb CHANGED Viewed

@@ -10,7 +10,7 @@ module Treat
   end
   # The current version of Treat.
-  VERSION = "1.0.4"
+  VERSION = "1.0.5"
   # Add methods to handle syntactic sugar,
   # language configuration options, and paths.
@@ -44,7 +44,7 @@ module Treat
   require 'treat/kernel'
   require 'treat/downloader'
   require 'treat/languages'
-  require 'treat/linguistics'
+  require 'treat/universalisation'
   require 'treat/entities'
   require 'treat/categories'
   require 'treat/data_set'

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: treat
 version: !ruby/object:Gem::Version
-  version: 1.0.4
+  version: 1.0.5
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-04-20 00:00:00.000000000 Z
+date: 2012-05-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rubyzip
@@ -161,6 +161,7 @@ files:
 - lib/treat/languages/german.rb
 - lib/treat/languages/greek.rb
 - lib/treat/languages/italian.rb
+- lib/treat/languages/language.rb
 - lib/treat/languages/list.txt
 - lib/treat/languages/polish.rb
 - lib/treat/languages/portuguese.rb
@@ -176,9 +177,6 @@ files:
 - lib/treat/lexicalizers/taggers/lingua.rb
 - lib/treat/lexicalizers/taggers/stanford.rb
 - lib/treat/lexicalizers.rb
-- lib/treat/linguistics/categories.rb
-- lib/treat/linguistics/tags.rb
-- lib/treat/linguistics.rb
 - lib/treat/loaders/linguistics.rb
 - lib/treat/loaders/stanford.rb
 - lib/treat/object.rb
@@ -190,7 +188,6 @@ files:
 - lib/treat/processors/segmenters/punkt.rb
 - lib/treat/processors/segmenters/stanford.rb
 - lib/treat/processors/segmenters/tactful.rb
-- lib/treat/processors/tokenizers/perl.rb
 - lib/treat/processors/tokenizers/ptb.rb
 - lib/treat/processors/tokenizers/punkt.rb
 - lib/treat/processors/tokenizers/stanford.rb
@@ -202,6 +199,9 @@ files:
 - lib/treat/retrievers.rb
 - lib/treat/server.rb
 - lib/treat/tree.rb
+- lib/treat/universalisation/encodings.rb
+- lib/treat/universalisation/tags.rb
+- lib/treat/universalisation.rb
 - lib/treat.rb
 - spec/collection.rb
 - spec/document.rb

data/lib/treat/linguistics.rb DELETED Viewed

@@ -1,9 +0,0 @@
-module Treat::Linguistics
-  p = 'treat/linguistics/*.rb'
-  Dir[Treat.lib + p].each do |f|
-    require f
-  end
-end

data/lib/treat/processors/tokenizers/perl.rb DELETED Viewed

@@ -1,132 +0,0 @@
-# encoding: utf-8
-#
-# Tokenize the entity using a native rule-based
-# algorithm. This tokenizer is a port from an
-# unknown Perl module, which I have lifted from
-# the 'rbtagger' gem.
-#
-# Author: Todd A. Fisher
-#
-# This code is free to use under the terms of
-# the MIT license.
-#
-# Original project website:
-#
-# https://github.com/taf2/rb-brill-tagger
-module Treat::Processors::Tokenizers::Perl
-  require 'treat/helpers/decimal_point_escaper'
-  # Tokenize the entity using a rule-based algorithm
-  # ported from Perl by Todd A. Fisher.
-  #
-  # Options: none.
-  def self.tokenize(entity, options = {})
-    entity.check_hasnt_children
-    s = entity.to_s
-    tokens = get_tokens(entity.to_s)
-    tokens[1..-1].each do |token|
-      next if token =~ /^\s*$/
-      entity << Treat::Entities::Token.
-      from_string(token)
-    end
-  end
-  # Helper method to perform the tokenization.
-  def self.get_tokens(string)
-    # Normalize all whitespace
-    text = string.gsub(/\s+/,' ')
-    # Replace all decimal points by ^^
-    Treat::Helpers::DecimalPointEscaper.escape!(text)
-=begin
-    # Translate some common extended ascii
-    # characters to quotes
-    text.gsub!(/‘/,'`')
-    text.gsub!(/’/,"'")
-    text.gsub!(/“/,"``")
-    text.gsub!(/”/,"''")
-    # Attempt to get correct directional quotes
-    # s{\"\b} { `` }g;
-    text.gsub!(/\"\b/,' `` ')
-    # s{\b\"} { '' }g;
-    text.gsub!(/\b\"/," '' ")
-    #s{\"(?=\s)} { '' }g;
-    text.gsub!(/\"(?=\s)/," '' ")
-    #s{\"} { `` }g;
-    text.gsub!(/\"(?=\s)/," `` ")
-=end
-    # Isolate ellipses
-    # s{\.\.\.}   { ... }g;
-    text.gsub!(/\.\.\./,' ... ')
-    # Isolate any embedded punctuation chars
-    #   s{([,;:\@\#\$\%&])} { $1 }g;
-    text.gsub!(/([,;:\@\#\$\%&])/, ' \1 ')
-    # Assume sentence tokenization has been
-    # done first, so split FINAL
-    # periods only.
-    # s/ ([^.]) \.  ([\]\)\}\>\"\']*)
-    # [ \t]* $ /$1 .$2 /gx;
-    text.gsub!(/ ([^.]) \.  ([\]\)\}\>\"\']*) [ \t]* $ /x, '\1 .\2 ')
-    # however, we may as well split ALL
-    # question marks and exclamation points,
-    # since they shouldn't have the abbrev.
-    # -marker ambiguity problem
-    #s{([?!])} { $1 }g;
-    text.gsub!(/([?!])/, ' \1 ')
-    # parentheses, brackets, etc.
-    #s{([\]\[\(\)\{\}\<\>])} { $1 }g;
-    text.gsub!(/([\]\[\(\)\{\}\<\>])/,' \1 ')
-    #s/(-{2,})/ $1 /g;
-    text.gsub!(/(-{2,})/,' \1 ')
-    # Add a space to the beginning and end of
-    # each line, to reduce # of regexps below.
-    #s/$/ /;
-    text.gsub!(/$/," ")
-    #s/^/ /;
-    text.gsub!(/^/," ")
-    # possessive or close-single-quote
-    #s/\([^\']\)\' /$1 \' /g;
-    text.gsub!(/\([^\']\)\' /,%q(\1 ' ))
-    # as in it's, I'm, we'd
-    #s/\'([smd]) / \'$1 /ig;
-    text.gsub!(/\'([smd]) /i,%q( '\1 ))
-    #s/\'(ll|re|ve) / \'$1 /ig;
-    text.gsub!(/\'(ll|re|ve) /i,%q( '\1 ))
-    #s/n\'t / n\'t /ig;
-    text.gsub!(/n\'t /i,"  n't ")
-    #s/ (can)(not) / $1 $2 /ig;
-    text.gsub!(/ (can)(not) /i,' \1 \2 ')
-    #s/ (d\')(ye) / $1 $2 /ig;
-    text.gsub!(/ (d\')(ye) /i,' \1 \2 ')
-    #s/ (gim)(me) / $1 $2 /ig;
-    text.gsub!(/ (gim)(me) /i,' \1 \2 ')
-    #s/ (gon)(na) / $1 $2 /ig;
-    text.gsub!(/ (gon)(na) /i,' \1 \2 ')
-    #s/ (got)(ta) / $1 $2 /ig;
-    text.gsub!(/ (got)(ta) /i,' \1 \2 ')
-    #s/ (lem)(me) / $1 $2 /ig;
-    text.gsub!(/ (lem)(me) /i,' \1 \2 ')
-    #s/ (more)(\'n) / $1 $2 /ig;
-    text.gsub!(/ (more)(\'n) /i,' \1 \2 ')
-    #s/ (\'t)(is|was) / $1 $2 /ig;
-    text.gsub!(/ (\'t)(is|was) /i,' \1 \2 ')
-    #s/ (wan)(na) / $1 $2 /ig;
-    text.gsub!(/ (wan)(na) /i,' \1 \2 ')
-    text.split(/\s/)
-  end
-end