RubyGems - treat - Versions diffs - 1.1.0 → 1.1.1 - Mend

treat 1.1.0 → 1.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (42) hide show

data/LICENSE +1 -1
data/README.md +3 -3
data/lib/treat/config.rb +10 -0
data/lib/treat/core/data_set.rb +80 -32
data/lib/treat/core/feature.rb +35 -0
data/lib/treat/core/problem.rb +43 -0
data/lib/treat/core/question.rb +27 -0
data/lib/treat/entities/abilities/buildable.rb +5 -3
data/lib/treat/entities/abilities/exportable.rb +4 -4
data/lib/treat/entities/collection.rb +1 -1
data/lib/treat/entities/document.rb +1 -1
data/lib/treat/entities/group.rb +8 -5
data/lib/treat/entities/section.rb +1 -1
data/lib/treat/entities/token.rb +20 -8
data/lib/treat/entities/zone.rb +6 -5
data/lib/treat/loaders/linguistics.rb +18 -19
data/lib/treat/loaders/stanford.rb +3 -2
data/lib/treat/version.rb +1 -1
data/lib/treat/workers/extractors/language/what_language.rb +53 -57
data/lib/treat/workers/extractors/name_tag/stanford.rb +8 -5
data/lib/treat/workers/formatters/serializers/mongo.rb +33 -27
data/lib/treat/workers/formatters/unserializers/mongo.rb +14 -36
data/lib/treat/workers/learners/classifiers/id3.rb +4 -5
data/lib/treat/workers/learners/classifiers/mlp.rb +1 -1
data/lib/treat/workers.rb +1 -1
data/spec/entity.rb +7 -5
data/spec/phrase.rb +2 -2
data/spec/zone.rb +2 -3
metadata +37 -15
data/bin/stanford/bridge.jar +0 -0
data/bin/stanford/joda-time.jar +0 -0
data/bin/stanford/stanford-corenlp.jar +0 -0
data/bin/stanford/stanford-parser.jar +0 -0
data/bin/stanford/xom.jar +0 -0
data/files/21552208.html +0 -683
data/files/3_2_release_notes.html +0 -766
data/files/nethttp-cheat-sheet-2940.html +0 -395
data/files/weather-central-canada-heat-wave.html +0 -1370
data/lib/treat/core/classification.rb +0 -63
data/lib/treat/core/server.rb +0 -3
data/spec/sandbox.rb +0 -223
data/tmp/english.yaml +0 -10340

data/LICENSE CHANGED Viewed

@@ -1,4 +1,4 @@
-Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 1.1.0
+Treat - Text Retrieval, Extraction and Annotation Toolkit, v. 1.1.1
 This program is free software: you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by

data/README.md CHANGED Viewed

@@ -21,9 +21,9 @@ Treat is a framework for natural language processing and computational linguisti
 **Resources**
 * Read the [latest documentation](http://rubydoc.info/github/louismullie/treat/frames).
-* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installing-Treat).
-* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Using-Treat).
-* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing-to-Treat).
+* See how to [install Treat](https://github.com/louismullie/treat/wiki/Installation).
+* Learn how to [use Treat](https://github.com/louismullie/treat/wiki/Manual).
+* Help out by [contributing to the project](https://github.com/louismullie/treat/wiki/Contributing).
 * View a list of [papers](https://github.com/louismullie/treat/wiki/Papers) about tools included in this toolkit.
 * Open an [issue](https://github.com/louismullie/treat/issues).

data/lib/treat/config.rb CHANGED Viewed

@@ -105,6 +105,16 @@ module Treat::Config
         end
       end
     end
+    Treat::Core.constants.each do |kname|
+      Object.class_eval do
+        klass = Treat::Core.const_get(kname)
+        define_method(kname) do |*args|
+          klass.new(*args)
+        end
+      end
+    end
   end
   # Turn off syntactic sugar.

data/lib/treat/core/data_set.rb CHANGED Viewed

@@ -1,48 +1,96 @@
+# A DataSet contains an entity classification
+# problem as well as data for entities that
+# have already been classified, complete with
+# references to these entities.
 class Treat::Core::DataSet
+  # Used to serialize Procs.
+  silence_warnings do
+    require 'sourcify'
+  end
-  attr_reader :classification
-  attr_reader :labels
-  attr_reader :items
-  attr_reader :ids
+  # The classification problem this
+  # data set holds data for.
+  attr_accessor :problem
+  # Items that have been already
+  # classified (training data).
+  attr_accessor :items
+  # References to the IDs of the
+  # original entities contained
+  # in the data set.
+  attr_accessor :entities
-  def self.open(file)
-    unless File.readable?(file)
-      raise Treat::Exception,
-      "Cannot load data set "+
-      "from #{file} because " +
-      "it doesn't exist."
+  # Initialize the DataSet. Can be
+  # done with a Problem entity
+  # (thereby creating an empty set)
+  # or with a filename (representing
+  # a serialized data set which will
+  # then be deserialized and loaded).
+  def initialize(prob_or_file)
+    if prob_or_file.is_a?(String)
+      ds = self.class.
+      unserialize(prob_or_file)
+      @problem = ds.problem
+      @items = ds.items
+      @entities = ds.entities
+    else
+      @problem = prob_or_file
+      @items, @entities = [], []
     end
-    ::Psych.load(
-    File.read(file))
-  end
-  def initialize(classification)
-    @classification = classification
-    @labels = classification.labels
-    @items = []
-    @ids = []
   end
+  # Add an entity to the data set.
+  # The entity's relevant features
+  # are calculated based on the
+  # classification problem, and a
+  # line with the results of the
+  # calculation is added to the
+  # data set, along with the ID
+  # of the entity.
   def <<(entity)
-    @items <<
-    @classification.
+    @items << @problem.
     export_item(entity)
-    @ids << entity.id
+    @entities << entity.id
   end
-  def save(file)
-    File.open(file, 'w') do |f|
-      f.write(::Psych.dump(self))
+  # Marshal the data set to the supplied
+  # file name. Marshal is used for speed;
+  # other serialization options may be
+  # provided in later versions. This
+  # method relies on the sourcify gem
+  # to transform Feature procs to strings,
+  # since procs/lambdas can't be serialized.
+  def serialize(file)
+    problem = @problem.dup
+    problem.features.each do |feature|
+      next unless feature.proc
+      feature.proc = feature.proc.to_source
+    end
+    data = [problem, @items, @entities]
+    File.open(file, 'w') do |f|
+      f.write(Marshal.dump(data))
+    end
+    problem.features.each do |feature|
+      next unless feature.proc
+      source = feature.proc[5..-1]
+      feature.proc = eval("Proc.new #{source}")
     end
   end
-  def to_ai4r
-    Ai4r::Data::DataSet.new(
-      :data_items => items,
-      :data_labels => (
-        labels.map { |l| l.to_s } +
-        [classification.question.to_s]
-    ))
+  # Unserialize a data set file created
+  # by using the #serialize method.
+  def self.unserialize(file)
+    data = Marshal.load(File.read(file))
+    problem, items, entities = *data
+    problem.features.each do |feature|
+      next unless feature.proc
+      source = feature.proc[5..-1]
+      feature.proc = eval("Proc.new #{source}")
+    end
+    data_set = Treat::Core::DataSet.new(problem)
+    data_set.items = items
+    data_set.entities = entities
+    data_set
   end
 end

data/lib/treat/core/feature.rb ADDED Viewed

@@ -0,0 +1,35 @@
+# Represents a feature to be used
+# in a classification task.
+class Treat::Core::Feature
+  # The name of the feature. If no
+  # proc is supplied, this assumes
+  # that the target of your classification
+  # problem responds to the method
+  # corresponding to this name.
+  attr_accessor :name
+  # A proc that can be used to perform
+  # calculations before storing a feature.
+  attr_accessor :proc
+  # The default value to be used if the feature evaluates to nil or false.
+  attr_accessor :default
+  # Initialize a feature for a classification
+  # problem. If two arguments are supplied,
+  # the second argument is assumed to be the
+  # default value. If three arguments are
+  # supplied, the second argument is the
+  # callback to generate the feature, and
+  # the third one is the default value.
+  def initialize(name, proc_or_default = nil, default = nil)
+    @name = name
+    if proc_or_default.is_a?(Proc)
+      @proc, @default =
+      proc_or_default, default
+    else
+      @proc = nil
+      @default = proc_or_default
+    end
+  end
+end

data/lib/treat/core/problem.rb ADDED Viewed

@@ -0,0 +1,43 @@
+# Defines a classification problem.
+# - What question are we trying to answer?
+# - What features are we going to look at
+#   to attempt to answer that question?
+class Treat::Core::Problem
+  # The question we are trying to answer.
+  attr_accessor :question
+  # An array of features that will be
+  # looked at in trying to answer the
+  # problem's question.
+  attr_accessor :features
+  # Just the labels from the features.
+  attr_accessor :labels
+  # Initialize the problem with a question
+  # and an arbitrary number of features.
+  def initialize(question, *features)
+    @question = question
+    @features = features
+    @labels = @features.map { |f| f.name }
+  end
+  # Return an array of all the entity's
+  # features, as defined by the problem.
+  # If include_answer is set to true, will
+  # append the answer to the problem after
+  # all of the features.
+  def export_item(e, include_answer = true)
+    line = []
+    @features.each do |feature|
+      r = feature.proc ?
+      feature.proc.call(e) :
+      e.send(feature.name)
+      line << (r || feature.default)
+    end
+    return line unless include_answer
+    line << (e.has?(@question.name) ?
+    e.get(@question.name) : @question.default)
+    line
+  end
+end

data/lib/treat/core/question.rb ADDED Viewed

@@ -0,0 +1,27 @@
+# Defines a question to answer in the
+# context of a classification problem.
+class Treat::Core::Question
+  # Defines an arbitrary label for the
+  # question we are trying to answer
+  # (e.g. is_key_sentence), which will
+  # also be used as the annotation name
+  # for the answer to the question.
+  attr_accessor :name
+  # Can be :continuous or :discrete,
+  # depending on the features used.
+  attr_accessor :type
+  # Defines the target of the question
+  # (e.g. :sentence, :paragraph, etc.)
+  attr_accessor :target
+  # Default for the answer to the question.
+  attr_accessor :default
+  # Initialize the question.
+  def initialize(name, target,
+    type = :continuous, default = nil)
+    @name, @target = name, target
+    @type, @default = type, default
+  end
+end

data/lib/treat/entities/abilities/buildable.rb CHANGED Viewed

@@ -2,6 +2,7 @@
 # from a folder of files, a specific file,
 # a string or a numeric object. This class
 # is pretty much self-explanatory.
+# FIXME how can we make this language independent?
 module Treat::Entities::Abilities::Buildable
   require 'schiphol'
@@ -162,8 +163,10 @@ module Treat::Entities::Abilities::Buildable
   # Build a document from a raw or serialized file.
   def from_file(file, options)
-    if file.index('yml') || file.index('yaml') || file.index('xml') || file.index('mongo')
+    if file.index('yml') ||
+      file.index('yaml') ||
+      file.index('xml') ||
+      file.index('mongo')
       from_serialized_file(file, options)
     else
       fmt = Treat::Workers::Formatters::Readers::Autoselect.
@@ -221,7 +224,6 @@ module Treat::Entities::Abilities::Buildable
     id = options[:id]
     e = self.new(nil, id)
     e.unserialize(adapter, options)
-    e
   end
   # Build any kind of entity from a string.

data/lib/treat/entities/abilities/exportable.rb CHANGED Viewed

@@ -1,11 +1,11 @@
 module Treat::Entities::Abilities::Exportable
-  def export(classification)
-    ds = Treat::Core::DataSet.new(classification)
-    each_entity(*classification.types) do |e|
+  def export(problem)
+    ds = Treat::Core::DataSet.new(problem)
+    each_entity(problem.question.target) do |e|
       ds << e
     end
     ds
   end
-end
+end

data/lib/treat/entities/collection.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Treat::Entities
   # Represents a collection of texts.
-  class Collection < Treat::Entities::Entity
+  class Collection < Entity
     # Initialize the collection with a folder
     # containing the texts of the collection.

data/lib/treat/entities/document.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Treat::Entities
   # Represents a document.
-  class Document < Treat::Entities::Entity
+  class Document < Entity
     # Initialize a document with a file name.
     def initialize(file = nil, id = nil)
       super('', id)

data/lib/treat/entities/group.rb CHANGED Viewed

@@ -1,15 +1,18 @@
 module Treat::Entities
-  # Any kind of grouped entities.
-  class Group < Treat::Entities::Entity; end
+  # Represents a group of tokens.
+  class Group < Entity; end
-  # Represents a group of words with a sentence ender.
+  # Represents a group of words
+  # with a sentence ender (.!?)
   class Sentence < Group; end
-  # Represents a group of words.
+  # Represents a group of words,
+  # with no sentence ender.
   class Phrase < Group; end
-  # Represents a non-linguistic fragment
+  # Represents a non-linguistic
+  # fragment (e.g. stray symbols).
   class Fragment < Group; end
 end

data/lib/treat/entities/section.rb CHANGED Viewed

@@ -1,6 +1,6 @@
 module Treat::Entities
   # Represents a section.
-  class Section < Treat::Entities::Entity; end
+  class Section < Entity; end
   # Represents a page of text.
   class Page < Section; end

data/lib/treat/entities/token.rb CHANGED Viewed

@@ -1,31 +1,43 @@
 module Treat::Entities
-  # Represents a terminal element in the text structure.
-  class Token < Treat::Entities::Entity; end
+  # Represents a terminal element
+  # (leaf) in the text structure.
+  class Token < Entity; end
-  # Represents a word.
+  # Represents a word.  Strictly,
+  # this is /^[[:alpha:]\-']+$/.
   class Word < Token; end
-  # Represents a clitic ('s).
+  # Represents an enclitic.
+  # Strictly, this is any of
+  # 'll 'm 're 's 't or 've.
   class Enclitic < Token; end
-  # Represents a number.
+  # Represents a number. Strictly,
+  # this is /^#?([0-9]+)(\.[0-9]+)?$/.
   class Number < Token
     def to_i; to_s.to_i; end
     def to_f; to_s.to_f; end
   end
   # Represents a punctuation sign.
+  # Strictly, this is /^[[:punct:]\$]+$/.
   class Punctuation < Token; end
   # Represents a character that is neither
-  # alphabetical, numerical or a punctuation
-  # character (e.g. @#$%&*).
+  # a word, an enclitic, a number or a
+  # punctuation character (e.g. @#$%&*).
   class Symbol < Token; end
-  # Represents a url.
+  # Represents a url. This is (imperfectly)
+  # defined as /^(http|https):\/\/[a-z0-9]
+  # +([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}
+  # (([0-9]{1,5})?\/.*)?$/ix
   class Url < Token; end
   # Represents a valid RFC822 address.
+  # This is (imperfectly) defined as
+  # /.+\@.+\..+/ (fixme maybe?)
   class Email < Token; end
   # Represents a token whose type

data/lib/treat/entities/zone.rb CHANGED Viewed

@@ -1,11 +1,12 @@
 module Treat::Entities
-  # Represents a zone of text
-  # (Title, Paragraph, List, Quote).
-  class Zone < Treat::Entities::Entity; end
+  # Represents a zone of text.
+  class Zone < Entity; end
-  # Represents a title, subtitle, logical header.
+  # Represents a title, subtitle,
+  # logical header of a text.
   class Title < Zone; end
-  # Represents a paragraph.
+  # Represents a paragraph (group
+  # of sentences and/or phrases).
   class Paragraph < Zone; end
 end

data/lib/treat/loaders/linguistics.rb CHANGED Viewed

@@ -2,28 +2,27 @@
 # registered with the Linguistics gem.
 class Treat::Loaders::Linguistics
-  silence_warnings do
-    require 'linguistics'
-  end
+  # Linguistics throws warnings; silence them.
+  silence_warnings { require 'linguistics' }
+  # Linguistics classes for each language.
   @@languages = {}
+  # Load the Linguistics class that corresponds
+  # to the supplied language; raises an exception
+  # if there is no such language class registered.
   def self.load(language)
-    if @@languages[language]
-      return @@languages[language]
+    silence_warnings do
+      @@languages[language] ||=
+      ::Linguistics.const_get(
+      language.to_s[0..1].upcase)
     end
-    begin
-      l = language.to_s[0..1].upcase
-      silence_warnings do
-        @@languages[language] =
-        ::Linguistics.const_get(l)
-      end
-    rescue RuntimeError
-      raise "Ruby Linguistics does " +
-      "not have a module installed " +
-      "for the #{language} language."
-    end
+    return @@languages[language]
+  rescue RuntimeError
+    raise Treat::Exception,
+    "Ruby Linguistics does " +
+    "not have a module installed " +
+    "for the #{language} language."
   end
-end
+end

data/lib/treat/loaders/stanford.rb CHANGED Viewed

@@ -1,10 +1,11 @@
-# A helper class to load the
-# Stanford Core NLP package.
+# A helper class to load the CoreNLP package.
 class Treat::Loaders::Stanford
   require 'stanford-core-nlp'
   @@loaded = false
+  # Load CoreNLP package for a given language.
   def self.load(language = nil)
     return if @@loaded
     language ||= Treat.core.language.default

data/lib/treat/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module Treat
-  VERSION = "1.1.0"
+  VERSION = "1.1.1"
 end

data/lib/treat/workers/extractors/language/what_language.rb CHANGED Viewed

@@ -1,61 +1,57 @@
-module Treat::Workers::Extractors::Language
-  # Adaptor for the 'whatlanguage' gem, which
-  # performs probabilistic language detection.
-  # The library works by checking for the presence
-  # of words with bloom filters built from
-  # dictionaries based upon each source language.
-  class WhatLanguage
-    # Require the 'whatlanguage' gem.
-    silence_warnings { require 'whatlanguage'  }
-    # Undefine the method defined by the gem.
-    String.class_eval { undef :language }
-    # By default, bias towards common languages.
-    DefaultOptions = {
-      :bias_toward => [:english, :french, :chinese, :german, :arabic, :spanish]
-    }
-    # Keep only once instance of the gem class.
-    @@detector = nil
-    # Detect the language of an entity using the
-    # 'whatlanguage' gem. Return an identifier
-    # corresponding to the ISO-639-2 code for the
-    # language.
-    #
-    # Options:
-    #
-    # - (Array of Symbols) bias => Languages to bias
-    # toward when more than one language is detected
-    # with equal probability.
-    def self.language(entity, options = {})
-      options = DefaultOptions.merge(options)
-      @@detector ||= ::WhatLanguage.new(:possibilities)
-      possibilities = @@detector.process_text(entity.to_s)
-      lang = {}
-      possibilities.each do |k,v|
-        lang[k.intern] = v
-      end
-      max = lang.values.max
-      ordered = lang.select { |i,j| j == max }.keys
-      ordered.each do |l|
-        if options[:bias_toward].include?(l)
-          return l
-        end
+# Adaptor for the 'whatlanguage' gem, which
+# performs probabilistic language detection.
+# The library works by checking for the presence
+# of words with bloom filters built from
+# dictionaries based upon each source language.
+module Treat::Workers::Extractors::Language::WhatLanguage
+  # Require the 'whatlanguage' gem.
+  silence_warnings { require 'whatlanguage'  }
+  # Undefine the method defined by the gem.
+  String.class_eval { undef :language }
+  # By default, bias towards common languages.
+  DefaultOptions = {
+    :bias_toward => [:english, :french, :chinese, :german, :arabic, :spanish]
+  }
+  # Keep only one instance of the gem class.
+  @@detector = nil
+  # Detect the language of an entity using the
+  # 'whatlanguage' gem. Return an identifier
+  # corresponding to the ISO-639-2 code for the
+  # language.
+  #
+  # Options:
+  #
+  # - (Array of Symbols) bias_toward => Languages to bias
+  # toward when more than one language is detected
+  # with equal probability.
+  def self.language(entity, options = {})
+    options = DefaultOptions.merge(options)
+    @@detector ||= ::WhatLanguage.new(:possibilities)
+    possibilities = @@detector.process_text(entity.to_s)
+    lang = {}
+    possibilities.each do |k,v|
+      lang[k.intern] = v
+    end
+    max = lang.values.max
+    ordered = lang.select { |i,j| j == max }.keys
+    ordered.each do |l|
+      if options[:bias_toward].include?(l)
+        return l
       end
-      return ordered.first
     end
+    return ordered.first
   end
-end
+end

data/lib/treat/workers/extractors/name_tag/stanford.rb CHANGED Viewed

@@ -12,24 +12,27 @@ class Treat::Workers::Extractors::NameTag::Stanford
     pp = nil
-    lang = entity.language
+    language = entity.language
-    Treat::Loaders::Stanford.load(lang)
+    Treat::Loaders::Stanford.load(language)
     isolated_token = entity.is_a?(Treat::Entities::Token)
     tokens = isolated_token ? [entity] : entity.tokens
     ms = StanfordCoreNLP::Config::Models[:ner][language]
-    ms = Treat::Loaders::Stanford.model_path + '/' +
+    model_path = Treat.libraries.stanford.model_path ||
+    (Treat.paths.models + '/stanford/')
+    ms = model_path + '/' +
     StanfordCoreNLP::Config::ModelFolders[:ner] +
     ms['3class']
-    @@classifiers[lang] ||=
+    @@classifiers[language] ||=
     StanfordCoreNLP::CRFClassifier.
     getClassifier(ms)
     token_list = StanfordCoreNLP.get_list(tokens)
-    sentence = @@classifiers[lang].classify_sentence(token_list)
+    sentence = @@classifiers[language].
+    classify_sentence(token_list)
     i = 0
     n = 0