treat 2.0.4 → 2.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -54,7 +54,7 @@ class Treat::Helpers::String
    if @@cc_cache[o_phrase]
      return @@cc_cache[o_phrase]
    end
-   if Treat.core.acronyms.include?(phrase)
+   if Treat.core.acronyms.include?(phrase.downcase)
      phrase = phrase.upcase
    else
      phrase.gsub!(Regex) { |a| a.upcase }
@@ -0,0 +1,48 @@
+ class Treat::Loaders::BindIt
+
+   # Keep track of whether its loaded or not.
+   @@loaded = {}
+
+   # Load CoreNLP package for a given language.
+   def self.load(klass, name, language = nil)
+
+     return if @@loaded[klass]
+
+     language ||= Treat.core.language.default
+
+     jar_path = Treat.libraries[name].jar_path ||
+       Treat.paths.bin + "#{name}/"
+     model_path = Treat.libraries[name].model_path ||
+       Treat.paths.models + "#{name}/"
+
+     if !File.directory?(jar_path)
+       raise Treat::Exception, "Looking for #{klass} " +
+         "library JAR files in #{jar_path}, but it is " +
+         "not a directory. Please set the config option " +
+         "Treat.libraries.#{name}.jar_path to a folder " +
+         "containing the appropriate JAR files."
+     end
+
+     if !File.directory?(model_path)
+       raise Treat::Exception, "Looking for #{klass} " +
+         "library model files in #{model_path}, but it " +
+         "is not a directory. Please set the config option " +
+         "Treat.libraries.#{name}.model_path to a folder " +
+         "containing the appropriate JAR files."
+     end
+
+     klass.jar_path = jar_path
+     klass.model_path = model_path
+     klass.use language
+
+     if Treat.core.verbosity.silence
+       klass.log_file = '/dev/null'
+     end
+
+     klass.bind
+
+     @@loaded[klass] = true
+
+   end
+
+ end
@@ -0,0 +1,12 @@
+ require 'treat/loaders/bind_it'
+
+ # A helper class to load the OpenNLP package.
+ class Treat::Loaders::OpenNLP < Treat::Loaders::BindIt
+
+   require 'open-nlp'
+
+   def self.load(language = nil)
+     super(OpenNLP, :open_nlp, language)
+   end
+
+ end
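For context, the new loader hierarchy is driven entirely by the `Treat.libraries` configuration: a subclass only names the library and the class to bind. A minimal usage sketch, assuming the OpenNLP gem is installed (the install paths below are placeholders, not values taken from this diff):

```ruby
require 'treat'

# Hypothetical install locations; point these at real OpenNLP JAR and
# model directories before calling the loader.
Treat.libraries.open_nlp.jar_path   = '/opt/open-nlp/bin/'
Treat.libraries.open_nlp.model_path = '/opt/open-nlp/models/'

# Delegates to Treat::Loaders::BindIt.load(OpenNLP, :open_nlp, language),
# which binds the Java classes once and caches the loaded state.
Treat::Loaders::OpenNLP.load(:english)
```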
@@ -1,53 +1,14 @@
+ require 'treat/loaders/bind_it'
+
  # A helper class to load the CoreNLP package.
- class Treat::Loaders::Stanford
+ class Treat::Loaders::Stanford < Treat::Loaders::BindIt

-   # Keep track of whether its loaded or not.
-   @@loaded = false
+   require 'stanford-core-nlp'

-   # Load CoreNLP package for a given language.
    def self.load(language = nil)
-
-     return if @@loaded
-
-     language ||= Treat.core.language.default
-
-     jar_path = Treat.libraries.stanford.jar_path ||
-       Treat.paths.bin + 'stanford/'
-     model_path = Treat.libraries.stanford.model_path ||
-       Treat.paths.models + 'stanford/'
-
-     if !File.directory?(jar_path)
-       raise Treat::Exception, "Looking for Stanford " +
-         "CoreNLP JAR files in #{jar_path}, but it is " +
-         "not a directory. Please set the config option " +
-         "Treat.libraries.stanford.jar_path to a folder " +
-         "containing the Stanford JAR files."
-     end
-
-     if !File.directory?(model_path)
-       raise Treat::Exception, "Looking for Stanford " +
-         "CoreNLP model files in #{model_path}, but it " +
-         "is not a directory. Please set the config option " +
-         "Treat.libraries.stanford.model_path to a folder " +
-         "containing the Stanford JAR files."
-     end
-
-     require 'stanford-core-nlp'
-
-     StanfordCoreNLP.jar_path = jar_path
-     StanfordCoreNLP.model_path = model_path
-     StanfordCoreNLP.use(language)
-
-     if Treat.core.verbosity.silence
-       StanfordCoreNLP.log_file = '/dev/null'
-     end
-
-     StanfordCoreNLP.bind
-
-     @@loaded = true
-
+     super(StanfordCoreNLP, :stanford, language)
    end
-
+
    def self.find_model(name, language)
      language = language.intern
      model_file = StanfordCoreNLP::Config::Models[name][language]
@@ -57,4 +18,4 @@ class Treat::Loaders::Stanford
      File.join(model_path, model_dir, model_file)
    end

- end
+ end
@@ -11,14 +11,15 @@ module Treat::Proxies
    def method_missing(sym, *args, &block)
      if [:do, :apply].include?(sym) ||
        Treat::Workers.lookup(sym)
-       to_entity.send(sym, *args)
+       to_entity.send(sym, *args)
      else
        super(sym, *args, &block)
      end
    end
+
    # Create an unknown type of entity by default.
    def to_entity(builder = nil)
-     Treat::Entities::Unknown(self.to_s)
+     Treat::Entities::Unknown.new(self.to_s)
    end
  end

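For context, this proxy backs the transparent string casting demonstrated later in the new sandbox script; a minimal sketch, assuming a worker such as `:stem` is available:

```ruby
require 'treat'

# Calling a worker on a plain String goes through method_missing, which
# casts the string via to_entity and forwards the call, so that
# 'inflection'.stem behaves like word('inflection').stem.
puts 'inflection'.stem
```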
data/lib/treat/version.rb CHANGED
@@ -1,7 +1,7 @@
  module Treat

    # The current version of Treat.
-   VERSION = "2.0.4"
+   VERSION = "2.0.5"

    # Treat requires Ruby >= 1.9.2
    if RUBY_VERSION < '1.9.2'
@@ -3,7 +3,7 @@ class Treat::Workers::Formatters::Readers::Autoselect
    ExtensionRegexp = /^.*?\.([a-zA-Z0-9]{2,5})$/
    ImageExtensions = ['gif', 'jpg', 'jpeg', 'png']
    DefaultOptions = {
-     :default_to => 'txt'
+     :default_to => 'document'
    }

    # Choose a reader to use.
@@ -0,0 +1,17 @@
+ require 'yomu'
+
+ # This class is a wrapper for Yomu.
+ # Yomu is a library for extracting text and metadata from files and documents
+ # using the Apache Tika content analysis toolkit.
+ class Treat::Workers::Formatters::Readers::Document
+   # Extract the readable text from any document.
+   #
+   # Options: none.
+   def self.read(document, options = {})
+     yomu = Yomu.new(document.file)
+
+     document.value = yomu.text
+     document.set :format, yomu.mimetype.extensions.first
+     document
+   end
+ end
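Together with the `:default_to => 'document'` change above, file types without a dedicated reader now fall through to this Yomu-backed reader. A minimal sketch of how it is exercised (the path is illustrative, and Yomu requires a working Java/Apache Tika setup):

```ruby
require 'treat'

# 'report.docx' is a placeholder path; any format Tika understands
# (.docx, .odt, .pdf, ...) should be readable through the new reader.
d = Treat::Entities::Document.build('report.docx')
puts d.to_s   # plain text extracted by Yomu
```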
@@ -65,6 +65,7 @@ class Treat::Workers::Formatters::Unserializers::XML
    value = v
  else
    v = v[1..-1].intern if v[0] == ':'
+   v = ":".intern if v == :''
    v = v.to_i if v =~ /^[0-9]*$/
    v = v.to_f if v =~ /^[0-9\.]*$/
    v = false if v == 'false'
@@ -15,7 +15,7 @@ module Treat::Workers::Groupable
    require file
    if not self.const_defined?(const)
      raise Treat::Exception,
-       "File #{file} does not define " +
+       "File #{file}.rb does not define " +
        "#{self}::#{const}."
    end
    const_get(const)
@@ -1,8 +1,10 @@
  # Maximum entropy tokenization supplied by OpenNLP.
- class Treat::Workers::Processors::Tokenizers::Maxent
+ class Treat::Workers::Processors::Tokenizers::OpenNlp

    require 'open-nlp'
-   OpenNLP.load
+   Treat::Loaders::OpenNLP.load
+
+   @@tokenizers = {}

    # Maximum entropy tokenization.
    def self.tokenize(entity, options = {})
@@ -20,8 +22,7 @@ class Treat::Workers::Processors::Tokenizers::Maxent
      tokens = tokenizer.tokenize(str).to_a

      tokens.each do |token|
-       entity << Treat::Entities
-         ::Token.from_string(chunk)
+       entity << Treat::Entities::Token.from_string(token)
      end

    end
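The renamed worker is what the new sandbox script invokes through the `:open_nlp` symbol. A minimal sketch, assuming the OpenNLP paths are configured as in the spec/helper.rb hunk below (the paths here are placeholders):

```ruby
require 'treat'
include Treat::Core::DSL

# Placeholder paths; see spec/helper.rb below for the values used by the
# test suite.
Treat.libraries.open_nlp.jar_path   = '/opt/open-nlp/bin/'
Treat.libraries.open_nlp.model_path = '/opt/open-nlp/models/'

s = sentence "This is a sentence to parse!"
s.tokenize(:open_nlp)   # dispatches to Tokenizers::OpenNlp.tokenize
s.print_tree
```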
@@ -9,10 +9,10 @@ module Treat::Specs::Entities
    it "opens the file and reads its " +
       "content into a document" do
      f = Treat.paths.spec +
-       'workers/examples/english/mathematicians/leibniz.txt'
+       'workers/examples/english/mathematicians/pythagoras.docx'
      d = Treat::Entities::Document.build(f)
      d.should be_an_instance_of Treat::Entities::Document
-     d.to_s.index('Gottfried Leibniz').should_not eql nil
+     d.to_s.index('Pythagoras of Samos').should_not eql nil
    end
  end

data/spec/helper.rb CHANGED
@@ -13,6 +13,10 @@ module Treat::Specs
      '/ruby/stanford-core-nlp-minimal/models/'
    Treat.libraries.stanford.jar_path =
      '/ruby/stanford-core-nlp-minimal/bin/'
+   Treat.libraries.open_nlp.jar_path =
+     '/ruby/open-nlp-english/bin/'
+   Treat.libraries.open_nlp.model_path =
+     '/ruby/open-nlp-english/models/'
    Treat.libraries.punkt.model_path =
      '/ruby/punkt/models/'
    Treat.libraries.reuters.model_path =
data/spec/sandbox.rb ADDED
@@ -0,0 +1,306 @@
+ # encoding: utf-8
+ require_relative '../lib/treat'
+
+ Treat.databases.mongo.db = 'treat_test'
+ Treat.libraries.stanford.model_path =
+   '/ruby/stanford-core-nlp-minimal/models/'
+ Treat.libraries.stanford.jar_path =
+   '/ruby/stanford-core-nlp-minimal/bin/'
+ Treat.libraries.punkt.model_path =
+   '/ruby/punkt/models/'
+ Treat.libraries.reuters.model_path =
+   '/ruby/reuters/models/'
+ Treat.libraries.open_nlp.jar_path =
+   '/ruby/open-nlp-english/bin/'
+ Treat.libraries.open_nlp.model_path =
+   '/ruby/open-nlp-english/models/'
+ Treat.core.verbosity.silence = false
+
+ include Treat::Core::DSL
+
+ s = sentence "This is a sentence to parse!"
+ s.tokenize(:open_nlp).parse
+ s.print_tree
+
+ =begin
+ Treat::Builder.new do
+   p = phrase "26 Feb"
+   p.tokenize.time :kronic
+   puts p.inspect
+   s = sentence "Hello, world!"
+   s2 = sentence "Hello world"
+   puts s.similarity :jaro_winkler, to: s2
+   puts s.distance :levenshtein, to: s2
+   # puts s.similarity :tf_idf, to: s2
+ end
+
+ g = group("I was running")
+ puts g.tag.inspect
+
+ Treat.libraries.stanford.jar_path = '/ruby/treat/bin/'
+ Treat.libraries.stanford.model_path = '/ruby/treat/models/'
+
+ p = paragraph
+ s = sentence
+ w = word
+
+ p = phrase 'hello world'
+ e = email 'louis@gmail.com'
+
+ d = question(:is_feature, :word)
+ =end
+ #d = document Treat.paths.spec + 'workers/examples/english/economist/hungarys_troubles.txt'
+ #d.apply :chunk, :segment, :tokenize, :tag, :category, :name_tag
+ #d.print_tree
+ #d = document Treat.paths.spec + 'workers/examples/english/economist/saving_the_euro.odt'
+ #d.print_tree
+ =begin
+ d = document 'test.htm'
+ d.apply :chunk
+ #d.serialize :yaml, file: 'test444.yaml'
+ d.set :test, 2
+ d.serialize :mongo, db: 'test'
+ d.set :test, 3
+ d.serialize :mongo, db: 'test'
+ d.apply :segment, :tokenize, :tag, :category
+ puts d.verb_count
+ #d2 = document id: d.id, db: 'test'
+ d2 = document 'features.test' => 3, db: 'test'
+ d2.apply :segment, :tokenize, :tag, :category
+ puts d2.verb_count
+ #d.print_tree
+ #s = document 'http://www.economist.com'
+
+ p = phrase 'hello', 'world', '!'
+ puts p.to_s
+ puts p.to_str
+ =end
+
+ =begin
+ ### Super basics.
+ puts p.value
+
+ p << 'bitch'
+ p << word('hello')
+ puts p.to_s
+ puts p.to_str
+ puts p.value
+ puts p.to_ary.inspect
+ =end
+
+ =begin
+
+ ### Configuration
+
+ # A boolean value indicating whether to silence the output of external libraries (e.g. Stanford tools, Enju, LDA, Ruby-FANN) when they are used.
+ puts Treat.core.verbosity.silence
+ # A boolean value indicating whether to explain the steps that Treat is performing.
+ puts Treat.core.verbosity.debug
+ # A boolean value indicating whether Treat should try to detect the language of newly input text.
+ puts Treat.core.language.detect
+ # The language to default to when detection is off.
+ puts Treat.core.language.default
+ # A symbol representing the finest level at which language detection should be performed if language detection is turned on.
+ puts Treat.core.language.detect_at
+
+ # A directory in which to create temporary files.
+ puts Treat.paths.tmp
+ # A directory in which to store downloaded files.
+ puts Treat.paths.files
+ # A directory containing trained models for various tasks.
+ puts Treat.paths.models
+ # A directory containing the spec files.
+ puts Treat.paths.spec
+ # A directory containing executables and JAR files.
+ puts Treat.paths.bin
+ puts Treat.paths.lib
+
+ # Set up Mongoid.
+ Treat.databases.mongo.db = 'your_database'
+ Treat.databases.mongo.host = 'localhost'
+ Treat.databases.mongo.port = '27017'
+
+ # Transparent string casting.
+ s = 'inflection'.stem
+ # is equivalent to
+ s = 'inflection'.to_entity.stem
+ # which comes down to
+ s = word('inflection').stem
+
+ # Transparent number casting.
+ n = 2.ordinal
+ # is equivalent to
+ s = 2.to_entity.ordinal
+ # which comes down to
+ s = number(2).ordinal
+ =end
+ =begin
+ ### BASIC USAGE
+
+ # Create a sentence
+ s = sentence 'Those who dream by day know of at least ' +
+   '19 things that escape those who dream only at night.'
+
+ # Tokenize and tag it.
+ s.tokenize.tag
+
+ # View the sentence structure.
+ s.print_tree
+
+ # Iterate over the tokens.
+ s.each_token do |tok|
+   puts tok.value
+   puts tok.type
+ end
+
+
+
+ # Arrays instead of iterators.
+ (s.nouns + s.adjectives).each do |word|
+   puts word.synonyms
+   puts word.antonyms
+ end
+
+ # Functions on numbers.
+ s.each_number do |num|
+   puts num.ordinal
+   puts num.cardinal
+ end
+
+ # See all the annotations.
+ s.each do |tok|
+   puts tok.inspect
+ end
+
+ # Lazy way of doing all of the above.
+ s = sentence 'Those who dream by day know of at least ' +
+   '19 things that escape those who dream only at night.'
+
+ s.apply :tokenize, :tag, :category,
+   :stem, :hyponyms, :hypernyms,
+   :antonyms, :ordinal, :cardinal
+
+ =end
+
+ =begin
+ ### A BIT MORE ADVANCED USAGE
+
+ section = section "Obama-Sarkozy Meeting\n" +
+   "Obama and Sarkozy met on January 1st to investigate " +
+   "the possibility of a new rescue plan. President " +
+   "Sarkozy is to meet Merkel next Tuesday in Berlin."
+
+ # Chunk: split the titles and paragraphs.
+ # Segment: perform sentence segmentation.
+ # Parse: parse the syntax of each sentence.
+ section.apply :chunk, :segment, :parse
+
+ # View the tree structure.
+ section.print_tree
+
+ # Get some basic info on the text.
+ puts section.title
+ puts section.sentence_count
+ puts section.word_count
+
+ section.apply :category
+ puts section.noun_count
+ puts section.frequency_of 'president'
+
+ section.each_phrase_with_tag('NP') do |phrase|
+   puts phrase.to_s
+ end
+
+ =end
+ =begin
+ ### URL documents, XML serialization.
+
+ urls = ['http://www.cbc.ca/news/world/story/2012/11/25/snc-lavalin-ben-aissa-charges.html',
+   'http://www.cbc.ca/news/world/story/2012/11/25/egypt.html', 'http://www.cbc.ca/news/canada/prince-edward-island/story/2012/11/25/pei-murder-arrest-stlucia.html', 'http://www.cbc.ca/news/world/story/2012/11/25/bangladesh-garment-factory-fire.html']
+
+ c = collection
+ urls.each { |url| c << document(url) }
+
+ # View the collection.
+ c.print_tree
+
+ c.apply :chunk, :segment, :tokenize
+ c.serialize :xml, :file => 'test.xml'
+
+ # Reopen the collection.
+ c = collection 'test.xml'
+
+ # View it again.
+ c.print_tree
+ =end
+ =begin
+ include Treat::Core::DSL
+
+ # Show progress bars for download.
+ Treat.core.verbosity.silence = false
+ # Explain what Treat is doing.
+ Treat.core.verbosity.debug = true
+
+ # Define the question "is it junk?" on sentences.
+ qn = question(:is_junk, :sentence)
+
+ # Frame the problem as depending on punctuation
+ # count and word count for each sentence.
+ pb = problem(qn,
+   feature(:punctuation_count),
+   feature(:word_count) )
+
+ # Get some web documents to work on.
+ url1 = 'http://en.wikipedia.org/wiki/NOD_mouse'
+ url2 = 'http://en.wikipedia.org/wiki/Academic_studies_about_Wikipedia'
+ d1, d2 = document(url1), document(url2)
+
+ # Process both of our documents.
+ [d1,d2].apply(:chunk, :segment, :tokenize)
+
+ # Answer our problem to create a training set.
+ d1.sentences[0..17].each { |s| s.set :is_junk, 0 }
+ d1.sentences[17..-1].each { |s| s.set :is_junk, 1 }
+ d_set = d1.export(pb)
+
+ # Define our gold standard results for evaluation.
+ d2.sentences[0..81].each { |s| s.set :is_true_junk, 0 }
+ d2.sentences[81..-1].each { |s| s.set :is_true_junk, 1 }
+
+ tp, fp, tn, fn = 0.0, 0.0, 0.0, 0.0
+
+ d2.sentences.map do |s|
+   pred = s.classify(:id3, training: d_set)
+   if pred == 1
+     tp += 1 if s.is_true_junk == 1
+     fp += 1 if s.is_true_junk == 0
+   else
+     tn += 1 if s.is_true_junk == 0
+     fn += 1 if s.is_true_junk == 1
+   end
+ end
+
+ puts "Precision: #{tp/(tp + fp)}"
+ puts "Recall: #{tp/(tp + fn)}"
+ =end
+ =begin
+ d = document 'http://louismullie.com/susan-text-scan1.jpg'
+ d.apply :chunk, :segment, :tokenize
+ d.print_tree
+ =end
+ =begin
+ # Syntax example
+ phra = phrase 'Obama', 'Sarkozy', 'Meeting'
+
+ para = paragraph 'Obama and Sarkozy met on January 1st to'
+   'investigate the possibility of a new rescue plan. Nicolas ' +
+   'Sarkozy is to meet Merkel next Tuesday in Berlin.'
+
+ sect = section title(phra), para
+ =end
+ =begin
+ puts "beer".plural.inspect
+ =end
+ # Treat.core.language.detect = true
+ # s = sentence "Du hast deiner Frau einen roten Ring gekauft."
+ #s.apply(:parse,:category).print_tree