treat 2.0.3 → 2.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (36)
  1. data/lib/treat/config/data/languages/agnostic.rb +6 -3
  2. data/lib/treat/config/data/languages/english.rb +1 -1
  3. data/lib/treat/config/data/workers/extractors.rb +8 -0
  4. data/lib/treat/loaders/stanford.rb +2 -0
  5. data/lib/treat/version.rb +1 -1
  6. data/lib/treat/workers/extractors/distance/levenshtein.rb +35 -0
  7. data/lib/treat/workers/extractors/name_tag/stanford.rb +4 -1
  8. data/lib/treat/workers/extractors/similarity/jaro_winkler.rb +38 -0
  9. data/lib/treat/workers/extractors/similarity/tf_idf.rb +19 -3
  10. data/lib/treat/workers/extractors/time/chronic.rb +6 -41
  11. data/lib/treat/workers/extractors/time/kronic.rb +20 -0
  12. data/lib/treat/workers/extractors/time/nickel.rb +0 -15
  13. data/lib/treat/workers/extractors/time/ruby.rb +2 -33
  14. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +11 -10
  15. data/lib/treat/workers/processors/parsers/stanford.rb +60 -112
  16. data/spec/entities/collection.rb +29 -25
  17. data/spec/entities/document.rb +45 -44
  18. data/spec/entities/entity.rb +295 -294
  19. data/spec/entities/phrase.rb +21 -17
  20. data/spec/entities/token.rb +43 -40
  21. data/spec/entities/word.rb +5 -1
  22. data/spec/entities/zone.rb +26 -22
  23. data/spec/helper.rb +7 -2
  24. data/spec/learning/data_set.rb +145 -141
  25. data/spec/learning/export.rb +46 -42
  26. data/spec/learning/problem.rb +114 -110
  27. data/spec/learning/question.rb +46 -42
  28. data/spec/treat.rb +41 -37
  29. data/spec/workers/agnostic.rb +2 -2
  30. data/spec/workers/english.rb +12 -12
  31. metadata +7 -8
  32. data/files/21552208.html +0 -786
  33. data/files/nethttp-cheat-sheet-2940.html +0 -393
  34. data/lib/treat/workers/extractors/similarity/levenshtein.rb +0 -36
  35. data/spec/sandbox.rb +0 -294
  36. data/spec/workers/examples/english/mathematicians/euler.html +0 -21
data/lib/treat/config/data/languages/agnostic.rb CHANGED
@@ -4,9 +4,10 @@
  'bson_ext', 'mongo', 'lda-ruby',
  'stanford-core-nlp', 'linguistics',
  'ruby-readability', 'whatlanguage',
- 'chronic', 'nickel', 'decisiontree',
+ 'chronic', 'kronic', 'nickel', 'decisiontree',
  'rb-libsvm', 'ruby-fann', 'zip', 'loggability',
- 'tf-idf-similarity', 'narray', 'fastimage'
+ 'tf-idf-similarity', 'narray', 'fastimage',
+ 'fuzzy-string-match', 'levenshtein-ffi'
  ],
  workers: {
  learners: {
@@ -16,7 +17,9 @@
  keywords: [:tf_idf],
  language: [:what_language],
  topic_words: [:lda],
- tf_idf: [:native]
+ tf_idf: [:native],
+ distance: [:levenshtein],
+ similarity: [:jaro_winkler, :tf_idf]
  },
  formatters: {
  serializers: [:xml, :yaml, :mongo],
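The gems added to the dependency list above back the new distance, similarity and time workers introduced in this release; the workers call require themselves, so the gems stay optional. A minimal Gemfile sketch (gem names taken from the list above, everything else assumed):

  # Optional gems backing the new 2.0.4 workers; only needed when the
  # corresponding worker is actually invoked.
  gem 'kronic'              # Treat::Workers::Extractors::Time::Kronic
  gem 'fuzzy-string-match'  # Treat::Workers::Extractors::Similarity::JaroWinkler
  gem 'levenshtein-ffi'     # Treat::Workers::Extractors::Distance::Levenshtein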
data/lib/treat/config/data/languages/english.rb CHANGED
@@ -14,7 +14,7 @@
  ],
  workers: {
  extractors: {
- time: [:chronic, :ruby, :nickel],
+ time: [:chronic, :kronic, :ruby, :nickel],
  topics: [:reuters],
  name_tag: [:stanford]
  },
data/lib/treat/config/data/workers/extractors.rb CHANGED
@@ -27,5 +27,13 @@
  tf_idf: {
  type: :annotator,
  targets: [:word]
+ },
+ similarity: {
+ type: :computer,
+ targets: [:entity]
+ },
+ distance: {
+ type: :computer,
+ targets: [:entity]
  }
  }
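The two new groups are declared as :computer workers that target any entity. A hedged sketch of how they might be reached through Treat's usual entity-level dispatch (the call form below, with the worker selected by its symbol and options passed along, is assumed rather than shown in this diff):

  # Assumed dispatch: entity.<group>(:worker_name, options).
  word = Treat::Entities::Word.new('spelling')
  word.distance(:levenshtein, to: 'speling')      # => 1 (one deletion)
  word.similarity(:jaro_winkler, to: 'speling')   # => a score close to 1.0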
data/lib/treat/loaders/stanford.rb CHANGED
@@ -42,6 +42,8 @@ class Treat::Loaders::Stanford
  StanfordCoreNLP.log_file = '/dev/null'
  end

+ StanfordCoreNLP.bind
+
  @@loaded = true

  end
data/lib/treat/version.rb CHANGED
@@ -1,7 +1,7 @@
  module Treat

  # The current version of Treat.
- VERSION = "2.0.3"
+ VERSION = "2.0.4"

  # Treat requires Ruby >= 1.9.2
  if RUBY_VERSION < '1.9.2'
data/lib/treat/workers/extractors/distance/levenshtein.rb ADDED
@@ -0,0 +1,35 @@
+ # The C extension uses char* strings, and so Unicode strings
+ # will give incorrect distances. Need to provide a pure
+ # implementation if that's the case (FIX).
+ class Treat::Workers::Extractors::Distance::Levenshtein
+
+ require 'levenshtein'
+
+ DefaultOptions = {
+ ins_cost: 1,
+ del_cost: 1,
+ sub_cost: 1
+ }
+
+ @@matcher = nil
+
+ # Return the levensthein distance between
+ # two strings taking into account the costs
+ # of insertion, deletion, and substitution.
+ def self.distance(entity, options)
+
+ options = DefaultOptions.merge(options)
+
+ unless options[:to]
+ raise Treat::Exception, "Must supply " +
+ "a string/entity to compare to using " +
+ "the option :to for this worker."
+ end
+
+ a, b = entity.to_s, options[:to].to_s
+
+ Levenshtein.distance(a, b)
+
+ end
+
+ end
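A quick direct-invocation sketch for the new worker (not part of the diff); anything responding to #to_s works, since both sides are stringified before Levenshtein.distance is called. Note that the cost options are merged but not yet forwarded to the gem:

  require 'treat'
  Treat::Workers::Extractors::Distance::Levenshtein.distance('kitten', to: 'sitting')
  # => 3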
data/lib/treat/workers/extractors/name_tag/stanford.rb CHANGED
@@ -1,7 +1,7 @@
  # Named entity tag extraction using the Stanford NLP
  # Deterministic Coreference Resolver, which implements a
  # multi-pass sieve coreference resolution (or anaphora
- # resolution) system.
+ # resolution) system based on conditional random fields.
  #
  # Original paper: Heeyoung Lee, Yves Peirsman, Angel
  # Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky.
@@ -24,6 +24,9 @@ class Treat::Workers::Extractors::NameTag::Stanford

  unless classifier = @@classifiers[language]
  model = Treat::Loaders::Stanford.find_model(:ner, language)
+ unless StanfordCoreNLP.const_defined?('CRFClassifier')
+ StanfordCoreNLP.load_class('CRFClassifier', 'edu.stanford.nlp.ie.crf')
+ end
  classifier = StanfordCoreNLP::CRFClassifier.getClassifier(model)
  @@classifiers[language] = classifier
  end
data/lib/treat/workers/extractors/similarity/jaro_winkler.rb ADDED
@@ -0,0 +1,38 @@
+ # Similarity measure for short strings such as person names.
+ # C extension won't work for Unicode strings; need to set
+ # extension to "pure" in that case (FIX).
+ class Treat::Workers::Extractors::Similarity::JaroWinkler
+
+ require 'fuzzystringmatch'
+
+ DefaultOptions = {
+ threshold: 0.7,
+ implementation: nil
+ }
+
+ @@matcher = nil
+
+ def self.similarity(entity, options={})
+
+ options = DefaultOptions.merge(options)
+
+ unless options[:to]
+ raise Treat::Exception, "Must supply " +
+ "a string/entity to compare to using " +
+ "the option :to for this worker."
+ end
+
+ unless @@matcher
+ impl = options[:implementation]
+ impl ||= defined?(JRUBY_VERSION) ? :pure : :native
+ klass = FuzzyStringMatch::JaroWinkler
+ @@matcher = klass.create(impl)
+ end
+
+ a, b = entity.to_s, options[:to].to_s
+
+ @@matcher.getDistance(a, b)
+
+ end
+
+ end
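A matching sketch for the Jaro-Winkler worker; on JRuby the pure-Ruby matcher is selected automatically, otherwise the C extension is used unless :implementation is overridden:

  Treat::Workers::Extractors::Similarity::JaroWinkler.similarity('martha', to: 'marhta')
  # => roughly 0.96 (1.0 means identical strings)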
data/lib/treat/workers/extractors/similarity/tf_idf.rb CHANGED
@@ -2,12 +2,28 @@
  class Treat::Workers::Extractors::Similarity::TfIdf

  require 'tf-idf-similarity'
+
+ def self.similarity(entity, options={})

- @collections = {}
+ raise 'Not currently implemented.'
+
+ unless options[:to] &&
+ options[:to].type == :document
+ raise Treat::Exception, 'Must supply ' +
+ 'a document to compare to using ' +
+ 'the option :to for this worker.'
+ end

- def self.tf_idf(collection, options={})
+ unless options[:to].parent_collection &&
+ entity.parent_collection
+ raise Treat::Exception, 'The TF*IDF ' +
+ 'similarity algorithm can only be applied ' +
+ 'to documents that are inside collections.'
+ end
+
  coll = TfIdfSimilarity::Collection.new
- collection.each_document do |doc|
+
+ entity.each_document do |doc|
  tdoc = TfIdfSimilarity::Document.new(doc.to_s)
  term_counts = Hash.new(0)
  doc.each_word do |word|
data/lib/treat/workers/extractors/time/chronic.rb CHANGED
@@ -2,52 +2,17 @@
  # Ruby natural language date parser.
  class Treat::Workers::Extractors::Time::Chronic

- # Require the 'chronic' gem.
- silence_warnings { require 'chronic' }
-
- # Require the Ruby DateTime module
+ require 'chronic'
  require 'date'
+
+ DefaultOptions = {guess: true}

  # Return the date information contained within
  # the entity by parsing it with the 'chronic' gem.
- #
- # Options: none.
  def self.time(entity, options = {})
-
- s = entity.to_s
- return if s =~ /^[0-9]+$/
- time = nil
-
- silence_warnings do
- time = ::Chronic.parse(s, {:guess => true})
- end
-
- if entity.has_parent? && remove_time_from_ancestors(entity, time)
- nil
- else
- time
- end
-
- end
-
- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
-
- entity.ancestors_with_type(:phrase).each do |a|
-
- next if !a.has?(:time)
- unless a.get(:time) == time
- return true
- end
- a.unset(:time)
-
- end
-
- false
-
+ options = DefaultOptions.merge(options)
+ time = ::Chronic.parse(entity.to_s, options)
+ time ? DateTime.parse(time.to_s) : nil
  end

  end
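The rewritten worker simply returns a DateTime, or nil when Chronic cannot parse the string, instead of reconciling time annotations across ancestors. A hedged sketch, assuming the worker is called directly with a plain string:

  Treat::Workers::Extractors::Time::Chronic.time('3rd of May 2012')
  # => DateTime for May 3, 2012 (with :guess, Chronic picks the middle of the span)
  Treat::Workers::Extractors::Time::Chronic.time('gibberish')
  # => nil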
data/lib/treat/workers/extractors/time/kronic.rb ADDED
@@ -0,0 +1,20 @@
+ # Time/date extraction using a simple rule-based library.
+ #
+ # Supported formats: Today, yesterday, tomorrow,
+ # last thursday, this thursday, 14 Sep, 14 June 2010.
+ # Any dates without a year are assumed to be in the past.
+ class Treat::Workers::Extractors::Time::Kronic
+
+ require 'kronic'
+ require 'date'
+
+ # Return the date information contained within
+ # the entity by parsing it with the 'chronic' gem.
+ #
+ # Options: none.
+ def self.time(entity, options = {})
+ time = Kronic.parse(entity.to_s)
+ time.is_a?(DateTime) ? time : nil
+ end
+
+ end
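A direct-call sketch for the new Kronic worker; the guard only lets through values that are already DateTime instances, so anything else Kronic returns comes back as nil:

  Treat::Workers::Extractors::Time::Kronic.time('14 Sep 2010')
  # => a DateTime if Kronic yields one, nil otherwise
  Treat::Workers::Extractors::Time::Kronic.time('not a date')
  # => nil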
data/lib/treat/workers/extractors/time/nickel.rb CHANGED
@@ -47,7 +47,6 @@ class Treat::Workers::Extractors::Time::Nickel
  occ.interval : :none
  time_recurrence_interval = interval

-
  s = [occ.start_date, occ.start_time]
  ds = [s[0].year, s[0].month, s[0].day] if s[0]
  ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
@@ -77,18 +76,4 @@ class Treat::Workers::Extractors::Time::Nickel

  end

- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
- entity.ancestors_with_type(:phrase).each do |a|
- next if !a.has?(:time)
- return false unless a.get(:time).to_s == time.to_s
- a.unset(:time, :time_recurrence,
- :time_recurrence_interval, :end_time)
- end
- true
- end
-
  end
data/lib/treat/workers/extractors/time/ruby.rb CHANGED
@@ -2,7 +2,7 @@
  # DateTime.parse() method.
  class Treat::Workers::Extractors::Time::Ruby

- # Require Ruby's date module.
+
  require 'date'

  # Return a DateTime object representing the date/time
@@ -13,42 +13,11 @@ class Treat::Workers::Extractors::Time::Ruby
  #
  # Options: none.
  def self.time(entity, options = {})
- s = entity.to_s
- return if s =~ /^[0-9]+$/
  begin
- time = ::DateTime.parse(s)
- if entity.has_parent? &&
- remove_time_from_ancestors(entity, time)
- nil
- else
- time
- end
+ DateTime.parse(entity.to_s)
  rescue
  nil
  end
  end

-
- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
-
- entity.ancestors_with_type(:phrase).each do |a|
-
- next if !a.has?(:time)
-
- unless a.get(:time) == time
- return true
- end
-
- a.unset(:time)
-
- end
-
- false
-
- end
-
  end
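The slimmed-down Ruby extractor is now a thin wrapper around DateTime.parse that swallows parse failures:

  Treat::Workers::Extractors::Time::Ruby.time('2012-06-12 14:30')
  # => DateTime for June 12, 2012, 14:30
  Treat::Workers::Extractors::Time::Ruby.time('not a date')
  # => nil (the ArgumentError from DateTime.parse is rescued)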
data/lib/treat/workers/lexicalizers/taggers/stanford.rb CHANGED
@@ -1,10 +1,10 @@
- # POS tagging using (i) explicit use of both preceding
- # and following tag contexts via a dependency network
- # representation, (ii) broad use of lexical features,
- # including jointly conditioning on multiple consecutive
- # words, (iii) effective use of priors in conditional
- # loglinear models, and (iv) fine-grained modeling of
- # unknown word features.
+ # POS tagging using a maximum entropy model, with (i)
+ # explicit use of both preceding and following tag
+ # contexts via a dependency network representation,
+ # (ii) broad use of lexical features, including jointly
+ # conditioning on multiple consecutive words, (iii)
+ # effective use of priors in conditional loglinear models,
+ # and (iv) fine-grained modeling of unknown word features.
  #
  # Original paper: Toutanova, Manning, Klein and Singer.
  # 2003. Feature-Rich Part-of-Speech Tagging with a
@@ -21,9 +21,6 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
  :tagger_model => nil
  }

- # Shortcut for gem config.
- Config = StanfordCoreNLP::Config
-
  # Tag the word using one of the Stanford taggers.
  def self.tag(entity, options = {})

@@ -64,6 +61,10 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
  def self.init_tagger(language)
  unless @@taggers[language]
  Treat::Loaders::Stanford.load(language)
+ unless StanfordCoreNLP.const_defined?('MaxentTagger')
+ StanfordCoreNLP.load_class('MaxentTagger',
+ 'edu.stanford.nlp.tagger.maxent')
+ end
  model = Treat::Loaders::Stanford.find_model(:pos,language)
  tagger = StanfordCoreNLP::MaxentTagger.new(model)
  @@taggers[language] = tagger
data/lib/treat/workers/processors/parsers/stanford.rb CHANGED
@@ -1,140 +1,88 @@
- # Parsing using an interface to a Java implementation
- # of probabilistic natural language parsers, both
- # optimized PCFG and lexicalized dependency parsers,
- # and a lexicalized PCFG parser.
- #
- # Original paper: Dan Klein and Christopher D.
- # Manning. 2003. Accurate Unlexicalized Parsing.
- # Proceedings of the 41st Meeting of the Association
+ # Parsing using an interface to a Java implementation
+ # of probabilistic natural language parsers, both
+ # optimized PCFG and lexicalized dependency parsers,
+ # and a lexicalized PCFG parser.
+ #
+ # Original paper: Dan Klein and Christopher D.
+ # Manning. 2003. Accurate Unlexicalized Parsing.
+ # Proceedings of the 41st Meeting of the Association
  # for Computational Linguistics, pp. 423-430.
  class Treat::Workers::Processors::Parsers::Stanford
-
+
  Pttc = Treat.tags.aligned.phrase_tags_to_category
-
+
  # Hold one instance of the pipeline per language.
  @@parsers = {}

- DefaultOptions = {
- :parser_model => nil,
- :tagger_model => nil
- }
+ DefaultOptions = { model: nil }

  # Parse the entity using the Stanford parser.
- #
- # Options:
- #
- # - (Boolean) :silent => whether to silence the output
- # of the JVM.
- # - (String) :log_file => a filename to log output to
- # instead of displaying it.
  def self.parse(entity, options = {})

- val, lang = entity.to_s, entity.language
- init(lang, options) unless @@parsers[lang]
-
- entity.check_hasnt_children
+ val, lang = entity.to_s, entity.language.intern
+
+ Treat::Loaders::Stanford.load(lang)

  tag_set = StanfordCoreNLP::Config::TagSets[lang]

- text = ::StanfordCoreNLP::Annotation.new(val)
- @@parsers[lang].annotate(text)
-
- text.get(:sentences).each do |s|
-
- if entity.is_a?(Treat::Entities::Sentence) ||
- entity.is_a?(Treat::Entities::Phrase)
- tag = s.get(:category).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= 'S'
- entity.set :tag, tag_s
- entity.set :tag_opt, tag_opt if tag_opt
- recurse(s.get(:tree).children[0], entity, tag_set)
- break ####### ? FIX
- else
- recurse(s.get(:tree), entity, tag_set)
- end
-
+ list = get_token_list(entity)
+ entity.remove_all!
+
+ model_file = options[:model] ||
+ StanfordCoreNLP::Config::Models[:parse][lang]
+
+ unless @@parsers[lang] && @@parsers[lang][model_file]
+ model_path = Treat.libraries.stanford.model_path ||
+ StanfordCoreNLP.model_path
+ model_folder = StanfordCoreNLP::Config::ModelFolders[:parse]
+ model = File.join(model_path, model_folder, model_file)
+ @@parsers[lang] ||= {}
+ options = StanfordCoreNLP::Options.new
+ parser = StanfordCoreNLP::LexicalizedParser
+ .getParserFromFile(model, options)
+ @@parsers[lang][model_file] = parser
  end
-
- entity.set :tag_set, tag_set

- end
+ parser = @@parsers[lang][model_file]
+
+ text = parser.apply(list)
+
+ recurse(text.children[0], entity, tag_set)
+ entity.set :tag_set, tag_set

- def self.init(lang, options)
- Treat::Loaders::Stanford.load(lang)
- options = DefaultOptions.merge(options)
- StanfordCoreNLP.use(lang.intern)
- if options[:tagger_model]
- StanfordCoreNLP.set_model('pos.model', options[:tagger_model])
- end
- if options[:parser_model]
- StanfordCoreNLP.set_model('parser.model', options[:parser_model])
- end
- annotators = [:tokenize, :ssplit, :pos, :lemma, :parse]
- @@parsers[lang] = StanfordCoreNLP.load(*annotators)
  end

- # Helper method which recurses the tree supplied by
- # the Stanford parser.
- def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
-
- if java_node.num_children == 0
-
- label = java_node.label
- tag = label.get(:part_of_speech).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= ''
- ruby_node.value = java_node.value.to_s.strip
- ruby_node.set :tag, tag_s
- ruby_node.set :tag_opt, tag_opt if tag_opt
- ruby_node.set :lemma, label.get(:lemma).to_s
-
- additional_tags.each do |t|
- lt = label.get(t)
- ruby_node.set t, lt.to_s if lt
- end
-
- ruby_node
-
- else
-
- if java_node.num_children == 1 &&
- java_node.children[0].num_children == 0
- recurse(java_node.children[0],
- ruby_node, tag_set, additional_tags)
- return
- end
+ def self.recurse(java_node, ruby_node, tag_set)
+
+ java_node.children.each do |java_child|

- java_node.children.each do |java_child|
-
- label = java_child.label
- tag = label.get(:category).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= ''
-
- if Pttc[tag_s] && Pttc[tag_s][tag_set]
- ruby_child = Treat::Entities::Phrase.new
- else
- l = java_child.children[0].to_s
- v = java_child.children[0].value.to_s.strip
-
- # Mhmhmhmhmhm FIX!
- val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
- ruby_child = Treat::Entities::Token.from_string(val)
- end
+ label = java_child.label
+ tag = label.get(:category).to_s

- ruby_child.set :tag, tag_s
- ruby_child.set :tag_opt, tag_opt if tag_opt
+ if Pttc[tag] && Pttc[tag][tag_set]
+ ruby_child = Treat::Entities::Phrase.new
+ ruby_child.set :tag, tag
  ruby_node << ruby_child
-
  unless java_child.children.empty?
- recurse(java_child, ruby_child, tag_set, additional_tags)
+ recurse(java_child, ruby_child, tag_set)
  end
-
+ else
+ val = java_child.children[0].to_s
+ ruby_child = Treat::Entities::Token.from_string(val)
+ ruby_child.set :tag, tag
+ ruby_node << ruby_child
  end
-
+
  end

  end
-
+
+ def self.get_token_list(entity)
+ list = StanfordCoreNLP::ArrayList.new
+ entity.tokens.each do |token|
+ list.add(StanfordCoreNLP::Word.new(token.to_s))
+ end
+ list
+ end
+
  end
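The parser no longer runs the full CoreNLP pipeline: it builds a Stanford token list from the entity's existing tokens, applies a LexicalizedParser loaded directly from a model file, and recurses over the resulting tree. A hedged end-to-end sketch (the DSL builder and print_tree call are assumptions; only the worker invocation comes from the diff, and it expects the entity to be tokenized already):

  require 'treat'
  include Treat::Core::DSL                          # assumed DSL entry point

  s = sentence('The parser rebuilds the tree.')     # hypothetical builder
  s.tokenize                                        # worker reads entity.tokens
  Treat::Workers::Processors::Parsers::Stanford.parse(s)
  s.print_tree                                      # phrases and tokens now carry :tag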