treat 2.0.3 → 2.0.4

Files changed (36)
  1. data/lib/treat/config/data/languages/agnostic.rb +6 -3
  2. data/lib/treat/config/data/languages/english.rb +1 -1
  3. data/lib/treat/config/data/workers/extractors.rb +8 -0
  4. data/lib/treat/loaders/stanford.rb +2 -0
  5. data/lib/treat/version.rb +1 -1
  6. data/lib/treat/workers/extractors/distance/levenshtein.rb +35 -0
  7. data/lib/treat/workers/extractors/name_tag/stanford.rb +4 -1
  8. data/lib/treat/workers/extractors/similarity/jaro_winkler.rb +38 -0
  9. data/lib/treat/workers/extractors/similarity/tf_idf.rb +19 -3
  10. data/lib/treat/workers/extractors/time/chronic.rb +6 -41
  11. data/lib/treat/workers/extractors/time/kronic.rb +20 -0
  12. data/lib/treat/workers/extractors/time/nickel.rb +0 -15
  13. data/lib/treat/workers/extractors/time/ruby.rb +2 -33
  14. data/lib/treat/workers/lexicalizers/taggers/stanford.rb +11 -10
  15. data/lib/treat/workers/processors/parsers/stanford.rb +60 -112
  16. data/spec/entities/collection.rb +29 -25
  17. data/spec/entities/document.rb +45 -44
  18. data/spec/entities/entity.rb +295 -294
  19. data/spec/entities/phrase.rb +21 -17
  20. data/spec/entities/token.rb +43 -40
  21. data/spec/entities/word.rb +5 -1
  22. data/spec/entities/zone.rb +26 -22
  23. data/spec/helper.rb +7 -2
  24. data/spec/learning/data_set.rb +145 -141
  25. data/spec/learning/export.rb +46 -42
  26. data/spec/learning/problem.rb +114 -110
  27. data/spec/learning/question.rb +46 -42
  28. data/spec/treat.rb +41 -37
  29. data/spec/workers/agnostic.rb +2 -2
  30. data/spec/workers/english.rb +12 -12
  31. metadata +7 -8
  32. data/files/21552208.html +0 -786
  33. data/files/nethttp-cheat-sheet-2940.html +0 -393
  34. data/lib/treat/workers/extractors/similarity/levenshtein.rb +0 -36
  35. data/spec/sandbox.rb +0 -294
  36. data/spec/workers/examples/english/mathematicians/euler.html +0 -21
data/lib/treat/config/data/languages/agnostic.rb CHANGED
@@ -4,9 +4,10 @@
  'bson_ext', 'mongo', 'lda-ruby',
  'stanford-core-nlp', 'linguistics',
  'ruby-readability', 'whatlanguage',
- 'chronic', 'nickel', 'decisiontree',
+ 'chronic', 'kronic', 'nickel', 'decisiontree',
  'rb-libsvm', 'ruby-fann', 'zip', 'loggability',
- 'tf-idf-similarity', 'narray', 'fastimage'
+ 'tf-idf-similarity', 'narray', 'fastimage',
+ 'fuzzy-string-match', 'levenshtein-ffi'
  ],
  workers: {
  learners: {
@@ -16,7 +17,9 @@
  keywords: [:tf_idf],
  language: [:what_language],
  topic_words: [:lda],
- tf_idf: [:native]
+ tf_idf: [:native],
+ distance: [:levenshtein],
+ similarity: [:jaro_winkler, :tf_idf]
  },
  formatters: {
  serializers: [:xml, :yaml, :mongo],
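
The new optional dependencies back the additions elsewhere in this release: levenshtein-ffi powers the new :levenshtein distance worker, fuzzy-string-match the :jaro_winkler similarity worker, and kronic the extra time extractor. A minimal sketch of checking that they are installed before using the new workers; the require names are taken from the worker files added below and differ from the gem names for two of them:

# Gem names added to the optional dependency list above, and the
# files they are actually required as:
#   'kronic'             -> require 'kronic'
#   'fuzzy-string-match' -> require 'fuzzystringmatch'
#   'levenshtein-ffi'    -> require 'levenshtein'
%w[kronic fuzzystringmatch levenshtein].each do |lib|
  begin
    require lib
  rescue LoadError
    warn "Optional dependency '#{lib}' is not installed; " \
         "the corresponding Treat worker will be unavailable."
  end
end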
data/lib/treat/config/data/languages/english.rb CHANGED
@@ -14,7 +14,7 @@
  ],
  workers: {
  extractors: {
- time: [:chronic, :ruby, :nickel],
+ time: [:chronic, :kronic, :ruby, :nickel],
  topics: [:reuters],
  name_tag: [:stanford]
  },
data/lib/treat/config/data/workers/extractors.rb CHANGED
@@ -27,5 +27,13 @@
  tf_idf: {
  type: :annotator,
  targets: [:word]
+ },
+ similarity: {
+ type: :computer,
+ targets: [:entity]
+ },
+ distance: {
+ type: :computer,
+ targets: [:entity]
  }
  }
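
Since both new groups are registered as computers targeting :entity, Treat's usual convention of turning a worker group into an entity method suggests usage along the following lines. This is a sketch of the expected call pattern rather than output from the gem's own specs; the DSL builder is assumed from Treat 2.0 and the :to option comes from the new worker files below.

require 'treat'
include Treat::Core::DSL   # Treat 2.0 DSL (assumed available)

w = word('dinner')

# Edit distance via the new :levenshtein worker (levenshtein-ffi gem).
w.distance(:levenshtein, to: 'diner')       # => 1

# String similarity via the new :jaro_winkler worker (fuzzy-string-match gem).
w.similarity(:jaro_winkler, to: 'dinners')  # => Float between 0.0 and 1.0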
data/lib/treat/loaders/stanford.rb CHANGED
@@ -42,6 +42,8 @@ class Treat::Loaders::Stanford
  StanfordCoreNLP.log_file = '/dev/null'
  end
 
+ StanfordCoreNLP.bind
+
  @@loaded = true
 
  end
data/lib/treat/version.rb CHANGED
@@ -1,7 +1,7 @@
  module Treat
 
  # The current version of Treat.
- VERSION = "2.0.3"
+ VERSION = "2.0.4"
 
  # Treat requires Ruby >= 1.9.2
  if RUBY_VERSION < '1.9.2'
data/lib/treat/workers/extractors/distance/levenshtein.rb ADDED
@@ -0,0 +1,35 @@
+ # The C extension uses char* strings, and so Unicode strings
+ # will give incorrect distances. Need to provide a pure
+ # implementation if that's the case (FIX).
+ class Treat::Workers::Extractors::Distance::Levenshtein
+
+ require 'levenshtein'
+
+ DefaultOptions = {
+ ins_cost: 1,
+ del_cost: 1,
+ sub_cost: 1
+ }
+
+ @@matcher = nil
+
+ # Return the Levenshtein distance between
+ # two strings, taking into account the costs
+ # of insertion, deletion, and substitution.
+ def self.distance(entity, options)
+
+ options = DefaultOptions.merge(options)
+
+ unless options[:to]
+ raise Treat::Exception, "Must supply " +
+ "a string/entity to compare to using " +
+ "the option :to for this worker."
+ end
+
+ a, b = entity.to_s, options[:to].to_s
+
+ Levenshtein.distance(a, b)
+
+ end
+
+ end
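
The worker only needs its first argument to respond to #to_s, so it can also be exercised directly with plain strings. Note that the insertion/deletion/substitution costs declared in DefaultOptions are merged but not yet forwarded to Levenshtein.distance. A hypothetical direct call:

# Assumes treat (and the levenshtein-ffi gem) are already loaded.
levenshtein = Treat::Workers::Extractors::Distance::Levenshtein

levenshtein.distance('kitten', to: 'sitting')   # => 3
levenshtein.distance('kitten', {})              # raises Treat::Exception: no :to option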
data/lib/treat/workers/extractors/name_tag/stanford.rb CHANGED
@@ -1,7 +1,7 @@
  # Named entity tag extraction using the Stanford NLP
  # Deterministic Coreference Resolver, which implements a
  # multi-pass sieve coreference resolution (or anaphora
- # resolution) system.
+ # resolution) system based on conditional random fields.
  #
  # Original paper: Heeyoung Lee, Yves Peirsman, Angel
  # Chang, Nathanael Chambers, Mihai Surdeanu, Dan Jurafsky.
@@ -24,6 +24,9 @@ class Treat::Workers::Extractors::NameTag::Stanford
 
  unless classifier = @@classifiers[language]
  model = Treat::Loaders::Stanford.find_model(:ner, language)
+ unless StanfordCoreNLP.const_defined?('CRFClassifier')
+ StanfordCoreNLP.load_class('CRFClassifier', 'edu.stanford.nlp.ie.crf')
+ end
  classifier = StanfordCoreNLP::CRFClassifier.getClassifier(model)
  @@classifiers[language] = classifier
  end
data/lib/treat/workers/extractors/similarity/jaro_winkler.rb ADDED
@@ -0,0 +1,38 @@
+ # Similarity measure for short strings such as person names.
+ # C extension won't work for Unicode strings; need to set
+ # extension to "pure" in that case (FIX).
+ class Treat::Workers::Extractors::Similarity::JaroWinkler
+
+ require 'fuzzystringmatch'
+
+ DefaultOptions = {
+ threshold: 0.7,
+ implementation: nil
+ }
+
+ @@matcher = nil
+
+ def self.similarity(entity, options={})
+
+ options = DefaultOptions.merge(options)
+
+ unless options[:to]
+ raise Treat::Exception, "Must supply " +
+ "a string/entity to compare to using " +
+ "the option :to for this worker."
+ end
+
+ unless @@matcher
+ impl = options[:implementation]
+ impl ||= defined?(JRUBY_VERSION) ? :pure : :native
+ klass = FuzzyStringMatch::JaroWinkler
+ @@matcher = klass.create(impl)
+ end
+
+ a, b = entity.to_s, options[:to].to_s
+
+ @@matcher.getDistance(a, b)
+
+ end
+
+ end
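
On the first call the worker builds a single matcher, using fuzzy-string-match's native C implementation on MRI and the pure-Ruby one on JRuby unless :implementation is passed explicitly; later calls reuse that matcher. A hypothetical direct call with plain strings:

# Assumes treat (and the fuzzy-string-match gem) are already loaded.
jaro = Treat::Workers::Extractors::Similarity::JaroWinkler

# Classic Jaro-Winkler example pair; scores roughly 0.96.
jaro.similarity('martha', to: 'marhta')

# Identical strings score 1.0; completely dissimilar ones approach 0.0.
jaro.similarity('treat', to: 'treat')   # => 1.0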
data/lib/treat/workers/extractors/similarity/tf_idf.rb CHANGED
@@ -2,12 +2,28 @@
  class Treat::Workers::Extractors::Similarity::TfIdf
 
  require 'tf-idf-similarity'
+
+ def self.similarity(entity, options={})
 
- @collections = {}
+ raise 'Not currently implemented.'
+
+ unless options[:to] &&
+ options[:to].type == :document
+ raise Treat::Exception, 'Must supply ' +
+ 'a document to compare to using ' +
+ 'the option :to for this worker.'
+ end
 
- def self.tf_idf(collection, options={})
+ unless options[:to].parent_collection &&
+ entity.parent_collection
+ raise Treat::Exception, 'The TF*IDF ' +
+ 'similarity algorithm can only be applied ' +
+ 'to documents that are inside collections.'
+ end
+
  coll = TfIdfSimilarity::Collection.new
- collection.each_document do |doc|
+
+ entity.each_document do |doc|
  tdoc = TfIdfSimilarity::Document.new(doc.to_s)
  term_counts = Hash.new(0)
  doc.each_word do |word|
data/lib/treat/workers/extractors/time/chronic.rb CHANGED
@@ -2,52 +2,17 @@
  # Ruby natural language date parser.
  class Treat::Workers::Extractors::Time::Chronic
 
- # Require the 'chronic' gem.
- silence_warnings { require 'chronic' }
-
- # Require the Ruby DateTime module
+ require 'chronic'
  require 'date'
+
+ DefaultOptions = {guess: true}
 
  # Return the date information contained within
  # the entity by parsing it with the 'chronic' gem.
- #
- # Options: none.
  def self.time(entity, options = {})
-
- s = entity.to_s
- return if s =~ /^[0-9]+$/
- time = nil
-
- silence_warnings do
- time = ::Chronic.parse(s, {:guess => true})
- end
-
- if entity.has_parent? && remove_time_from_ancestors(entity, time)
- nil
- else
- time
- end
-
- end
-
- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
-
- entity.ancestors_with_type(:phrase).each do |a|
-
- next if !a.has?(:time)
- unless a.get(:time) == time
- return true
- end
- a.unset(:time)
-
- end
-
- false
-
+ options = DefaultOptions.merge(options)
+ time = ::Chronic.parse(entity.to_s, options)
+ time ? DateTime.parse(time.to_s) : nil
  end
 
  end
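
The slimmed-down Chronic worker merges the caller's options into {guess: true}, hands everything to Chronic.parse, and converts the resulting Time to a DateTime (or returns nil when nothing parses). Any Chronic option can therefore be passed straight through; a hypothetical direct call:

# Assumes treat and the chronic gem are already loaded.
chronic = Treat::Workers::Extractors::Time::Chronic

chronic.time('tomorrow at 6pm')         # => a DateTime for tomorrow, 18:00
chronic.time('6 May', context: :past)   # => Chronic's :context option forwarded as-is
chronic.time('no date here')            # => nil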
data/lib/treat/workers/extractors/time/kronic.rb ADDED
@@ -0,0 +1,20 @@
+ # Time/date extraction using a simple rule-based library.
+ #
+ # Supported formats: Today, yesterday, tomorrow,
+ # last thursday, this thursday, 14 Sep, 14 June 2010.
+ # Any dates without a year are assumed to be in the past.
+ class Treat::Workers::Extractors::Time::Kronic
+
+ require 'kronic'
+ require 'date'
+
+ # Return the date information contained within
+ # the entity by parsing it with the 'kronic' gem.
+ #
+ # Options: none.
+ def self.time(entity, options = {})
+ time = Kronic.parse(entity.to_s)
+ time.is_a?(DateTime) ? time : nil
+ end
+
+ end
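
Kronic handles only the small set of human-friendly formats listed in the header comment, and the guard in the worker returns nil unless Kronic hands back a DateTime. A hypothetical direct call; results depend on the current date:

# Assumes treat and the kronic gem are already loaded.
kronic = Treat::Workers::Extractors::Time::Kronic

kronic.time('14 June 2010')   # => a DateTime for 2010-06-14, or nil if Kronic returns another type
kronic.time('last thursday')  # => the most recent Thursday, subject to the same guard
kronic.time('gibberish')      # => nil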
data/lib/treat/workers/extractors/time/nickel.rb CHANGED
@@ -47,7 +47,6 @@ class Treat::Workers::Extractors::Time::Nickel
  occ.interval : :none
  time_recurrence_interval = interval
 
-
  s = [occ.start_date, occ.start_time]
  ds = [s[0].year, s[0].month, s[0].day] if s[0]
  ts = [s[1].hour, s[1].minute, s[1].second] if s[1]
@@ -77,18 +76,4 @@ class Treat::Workers::Extractors::Time::Nickel
 
  end
 
- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
- entity.ancestors_with_type(:phrase).each do |a|
- next if !a.has?(:time)
- return false unless a.get(:time).to_s == time.to_s
- a.unset(:time, :time_recurrence,
- :time_recurrence_interval, :end_time)
- end
- true
- end
-
  end
data/lib/treat/workers/extractors/time/ruby.rb CHANGED
@@ -2,7 +2,7 @@
  # DateTime.parse() method.
  class Treat::Workers::Extractors::Time::Ruby
 
- # Require Ruby's date module.
+
  require 'date'
 
  # Return a DateTime object representing the date/time
@@ -13,42 +13,11 @@ class Treat::Workers::Extractors::Time::Ruby
  #
  # Options: none.
  def self.time(entity, options = {})
- s = entity.to_s
- return if s =~ /^[0-9]+$/
  begin
- time = ::DateTime.parse(s)
- if entity.has_parent? &&
- remove_time_from_ancestors(entity, time)
- nil
- else
- time
- end
+ DateTime.parse(entity.to_s)
  rescue
  nil
  end
  end
 
-
- # Keeps the lowest-level time annotations that do
- # not conflict with a higher time annotation.
- # Returns true if the entity conflicts with a
- # higher-level time annotation.
- def self.remove_time_from_ancestors(entity, time)
-
- entity.ancestors_with_type(:phrase).each do |a|
-
- next if !a.has?(:time)
-
- unless a.get(:time) == time
- return true
- end
-
- a.unset(:time)
-
- end
-
- false
-
- end
-
  end
data/lib/treat/workers/lexicalizers/taggers/stanford.rb CHANGED
@@ -1,10 +1,10 @@
- # POS tagging using (i) explicit use of both preceding
- # and following tag contexts via a dependency network
- # representation, (ii) broad use of lexical features,
- # including jointly conditioning on multiple consecutive
- # words, (iii) effective use of priors in conditional
- # loglinear models, and (iv) fine-grained modeling of
- # unknown word features.
+ # POS tagging using a maximum entropy model, with (i)
+ # explicit use of both preceding and following tag
+ # contexts via a dependency network representation,
+ # (ii) broad use of lexical features, including jointly
+ # conditioning on multiple consecutive words, (iii)
+ # effective use of priors in conditional loglinear models,
+ # and (iv) fine-grained modeling of unknown word features.
  #
  # Original paper: Toutanova, Manning, Klein and Singer.
  # 2003. Feature-Rich Part-of-Speech Tagging with a
@@ -21,9 +21,6 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
  :tagger_model => nil
  }
 
- # Shortcut for gem config.
- Config = StanfordCoreNLP::Config
-
  # Tag the word using one of the Stanford taggers.
  def self.tag(entity, options = {})
 
@@ -64,6 +61,10 @@ class Treat::Workers::Lexicalizers::Taggers::Stanford
  def self.init_tagger(language)
  unless @@taggers[language]
  Treat::Loaders::Stanford.load(language)
+ unless StanfordCoreNLP.const_defined?('MaxentTagger')
+ StanfordCoreNLP.load_class('MaxentTagger',
+ 'edu.stanford.nlp.tagger.maxent')
+ end
  model = Treat::Loaders::Stanford.find_model(:pos,language)
  tagger = StanfordCoreNLP::MaxentTagger.new(model)
  @@taggers[language] = tagger
data/lib/treat/workers/processors/parsers/stanford.rb CHANGED
@@ -1,140 +1,88 @@
- # Parsing using an interface to a Java implementation
- # of probabilistic natural language parsers, both
- # optimized PCFG and lexicalized dependency parsers,
- # and a lexicalized PCFG parser.
- #
- # Original paper: Dan Klein and Christopher D.
- # Manning. 2003. Accurate Unlexicalized Parsing.
- # Proceedings of the 41st Meeting of the Association
+ # Parsing using an interface to a Java implementation
+ # of probabilistic natural language parsers, both
+ # optimized PCFG and lexicalized dependency parsers,
+ # and a lexicalized PCFG parser.
+ #
+ # Original paper: Dan Klein and Christopher D.
+ # Manning. 2003. Accurate Unlexicalized Parsing.
+ # Proceedings of the 41st Meeting of the Association
  # for Computational Linguistics, pp. 423-430.
  class Treat::Workers::Processors::Parsers::Stanford
-
+
  Pttc = Treat.tags.aligned.phrase_tags_to_category
-
+
  # Hold one instance of the pipeline per language.
  @@parsers = {}
 
- DefaultOptions = {
- :parser_model => nil,
- :tagger_model => nil
- }
+ DefaultOptions = { model: nil }
 
  # Parse the entity using the Stanford parser.
- #
- # Options:
- #
- # - (Boolean) :silent => whether to silence the output
- # of the JVM.
- # - (String) :log_file => a filename to log output to
- # instead of displaying it.
  def self.parse(entity, options = {})
 
- val, lang = entity.to_s, entity.language
- init(lang, options) unless @@parsers[lang]
-
- entity.check_hasnt_children
+ val, lang = entity.to_s, entity.language.intern
+
+ Treat::Loaders::Stanford.load(lang)
 
  tag_set = StanfordCoreNLP::Config::TagSets[lang]
 
- text = ::StanfordCoreNLP::Annotation.new(val)
- @@parsers[lang].annotate(text)
-
- text.get(:sentences).each do |s|
-
- if entity.is_a?(Treat::Entities::Sentence) ||
- entity.is_a?(Treat::Entities::Phrase)
- tag = s.get(:category).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= 'S'
- entity.set :tag, tag_s
- entity.set :tag_opt, tag_opt if tag_opt
- recurse(s.get(:tree).children[0], entity, tag_set)
- break ####### ? FIX
- else
- recurse(s.get(:tree), entity, tag_set)
- end
-
+ list = get_token_list(entity)
+ entity.remove_all!
+
+ model_file = options[:model] ||
+ StanfordCoreNLP::Config::Models[:parse][lang]
+
+ unless @@parsers[lang] && @@parsers[lang][model_file]
+ model_path = Treat.libraries.stanford.model_path ||
+ StanfordCoreNLP.model_path
+ model_folder = StanfordCoreNLP::Config::ModelFolders[:parse]
+ model = File.join(model_path, model_folder, model_file)
+ @@parsers[lang] ||= {}
+ options = StanfordCoreNLP::Options.new
+ parser = StanfordCoreNLP::LexicalizedParser
+ .getParserFromFile(model, options)
+ @@parsers[lang][model_file] = parser
  end
-
- entity.set :tag_set, tag_set
 
- end
+ parser = @@parsers[lang][model_file]
+
+ text = parser.apply(list)
+
+ recurse(text.children[0], entity, tag_set)
+ entity.set :tag_set, tag_set
 
- def self.init(lang, options)
- Treat::Loaders::Stanford.load(lang)
- options = DefaultOptions.merge(options)
- StanfordCoreNLP.use(lang.intern)
- if options[:tagger_model]
- StanfordCoreNLP.set_model('pos.model', options[:tagger_model])
- end
- if options[:parser_model]
- StanfordCoreNLP.set_model('parser.model', options[:parser_model])
- end
- annotators = [:tokenize, :ssplit, :pos, :lemma, :parse]
- @@parsers[lang] = StanfordCoreNLP.load(*annotators)
  end
 
- # Helper method which recurses the tree supplied by
- # the Stanford parser.
- def self.recurse(java_node, ruby_node, tag_set, additional_tags = [])
-
- if java_node.num_children == 0
-
- label = java_node.label
- tag = label.get(:part_of_speech).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= ''
- ruby_node.value = java_node.value.to_s.strip
- ruby_node.set :tag, tag_s
- ruby_node.set :tag_opt, tag_opt if tag_opt
- ruby_node.set :lemma, label.get(:lemma).to_s
-
- additional_tags.each do |t|
- lt = label.get(t)
- ruby_node.set t, lt.to_s if lt
- end
-
- ruby_node
-
- else
-
- if java_node.num_children == 1 &&
- java_node.children[0].num_children == 0
- recurse(java_node.children[0],
- ruby_node, tag_set, additional_tags)
- return
- end
+ def self.recurse(java_node, ruby_node, tag_set)
+
+ java_node.children.each do |java_child|
 
- java_node.children.each do |java_child|
-
- label = java_child.label
- tag = label.get(:category).to_s
- tag_s, tag_opt = *tag.split('-')
- tag_s ||= ''
-
- if Pttc[tag_s] && Pttc[tag_s][tag_set]
- ruby_child = Treat::Entities::Phrase.new
- else
- l = java_child.children[0].to_s
- v = java_child.children[0].value.to_s.strip
-
- # Mhmhmhmhmhm FIX!
- val = (l == v) ? v : l.split(' ')[-1].gsub(')', '')
- ruby_child = Treat::Entities::Token.from_string(val)
- end
+ label = java_child.label
+ tag = label.get(:category).to_s
 
- ruby_child.set :tag, tag_s
- ruby_child.set :tag_opt, tag_opt if tag_opt
+ if Pttc[tag] && Pttc[tag][tag_set]
+ ruby_child = Treat::Entities::Phrase.new
+ ruby_child.set :tag, tag
  ruby_node << ruby_child
-
  unless java_child.children.empty?
- recurse(java_child, ruby_child, tag_set, additional_tags)
+ recurse(java_child, ruby_child, tag_set)
  end
-
+ else
+ val = java_child.children[0].to_s
+ ruby_child = Treat::Entities::Token.from_string(val)
+ ruby_child.set :tag, tag
+ ruby_node << ruby_child
  end
-
+
  end
 
  end
-
+
+ def self.get_token_list(entity)
+ list = StanfordCoreNLP::ArrayList.new
+ entity.tokens.each do |token|
+ list.add(StanfordCoreNLP::Word.new(token.to_s))
+ end
+ list
+ end
+
  end
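
The reworked parser no longer drives the full CoreNLP pipeline: it collects the entity's existing tokens into a StanfordCoreNLP::ArrayList, clears the entity, and applies a LexicalizedParser cached per language and model file, so the entity should be tokenized before parsing. A sketch of the expected usage through Treat's DSL; the DSL builder, print_tree call, and model file name are illustrative assumptions, and the default model comes from StanfordCoreNLP::Config::Models:

require 'treat'
include Treat::Core::DSL   # Treat 2.0 DSL (assumed available)

s = sentence('The parser builds a syntax tree from pre-tokenized input.')
s.tokenize                 # the worker reads entity.tokens (see get_token_list)
s.parse(:stanford)         # or: s.parse(:stanford, model: 'englishPCFG.ser.gz')

s.print_tree               # inspect the resulting phrase/token structure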