RubyGems - treat - Versions diffs - 2.0.2 → 2.0.3 - Mend

treat 2.0.2 → 2.0.3

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (32)

data/files/21552208.html +786 -0
data/files/nethttp-cheat-sheet-2940.html +393 -0
data/lib/treat/builder.rb +6 -0
data/lib/treat/config/data/languages/agnostic.rb +2 -2
data/lib/treat/core/server.rb +1 -0
data/lib/treat/entities/entity/buildable.rb +1 -1
data/lib/treat/loaders/linguistics.rb +6 -7
data/lib/treat/loaders/stanford.rb +45 -11
data/lib/treat/version.rb +1 -1
data/lib/treat/workers/categorizable.rb +30 -32
data/lib/treat/workers/extractors/name_tag/stanford.rb +8 -24
data/lib/treat/workers/formatters/readers/html.rb +1 -1
data/lib/treat/workers/formatters/readers/xml.rb +1 -1
data/lib/treat/workers/formatters/unserializers/mongo.rb +1 -1
data/lib/treat/workers/groupable.rb +2 -1
data/lib/treat/workers/inflectors/cardinalizers/linguistics.rb +3 -3
data/lib/treat/workers/inflectors/conjugators/linguistics.rb +6 -4
data/lib/treat/workers/inflectors/declensors/linguistics.rb +11 -18
data/lib/treat/workers/inflectors/ordinalizers/linguistics.rb +4 -4
data/lib/treat/workers/lexicalizers/sensers/wordnet.rb +1 -1
data/lib/treat/workers/lexicalizers/taggers/stanford.rb +23 -21
data/lib/treat/workers/processors/parsers/stanford.rb +10 -20
data/lib/treat/workers/processors/segmenters/stanford.rb +1 -1
data/lib/treat/workers/processors/tokenizers/maxent.rb +29 -0
data/lib/treat/workers/processors/tokenizers/stanford.rb +2 -4
data/lib/treat.rb +1 -0
data/spec/helper.rb +8 -6
data/spec/sandbox.rb +18 -6
data/spec/workers/agnostic.rb +76 -29
data/spec/workers/english.rb +23 -73
data/spec/workers/examples/english/economist/saving_the_euro.odt +0 -0
metadata +6 -18

data/spec/workers/agnostic.rb CHANGED Viewed

@@ -1,44 +1,91 @@
-$workers = Treat.languages.agnostic.workers
+class Treat::Specs::Workers::Agnostic
+  @@workers = Treat.languages.agnostic.workers
-describe Treat::Workers::Extractors::Language do
-  before do
-    @entities = ["Obama and Sarkozy will meet in Berlin."]
-    @languages = ["english"]
-  end
-  context "when called on any textual entity" do
-    it "returns the language of the entity" do
-      # Treat.core.language.detect = true
-      $workers.extractors.language.each do |extractor|
-        @entities.map(&:language).should eql @languages
+  describe Treat::Workers::Extractors::Language do
+    before do
+      @entities = ["Obama and Sarkozy will meet in Berlin."]
+      @languages = ["english"]
+    end
+    context "when called on any textual entity" do
+      it "returns the language of the entity" do
+        # Treat.core.language.detect = true
+        @@workers.extractors.language.each do |extractor|
+          @entities.map(&:language).should eql @languages
+        end
+        # Treat.core.language.detect = false
       end
-      # Treat.core.language.detect = false
     end
   end
-end
-describe Treat::Workers::Formatters::Serializers do
-  before do
-    @texts = ["A test entity"]
-  end
-  context "when #serialize is called on any textual entity" do
-    it "serializes the entity to disk and returns a pointer to the location" do
-      # m = Treat::Entities::Entity.build
-      @texts.map(&:to_entity).map(&:serialize)
-      .map(&method(:entity)).map(&:to_s).should eql @texts
+  describe Treat::Workers::Extractors::TopicWords do
+    before do
+      @collections = ["./spec/workers/examples/english/economist"]
+      @topic_words = [["euro", "zone", "european", "mrs", "greece", "chancellor",
+      "berlin", "practice", "german", "germans"], ["bank", "minister", "central",
+      "bajnai", "mr", "hu", "orban", "commission", "hungarian", "government"],
+      ["bank", "mr", "central", "bajnai", "prime", "government", "brussels",
+      "responsibility", "national", "independence"], ["mr", "bank", "central",
+      "policies", "prime", "minister", "today", "financial", "government", "funds"],
+      ["euro", "merkel", "mr", "zone", "european", "greece", "german", "berlin",
+      "sarkozy", "government"], ["mr", "bajnai", "today", "orban", "government",
+      "forced", "independence", "part", "hand", "minister"], ["sarkozy", "mrs",
+      "zone", "euro", "fiscal", "called", "greece", "merkel", "german", "financial"],
+      ["mr", "called", "central", "policies", "financial", "bank", "european",
+      "prime", "minister", "shift"], ["bajnai", "orban", "prime", "mr", "government",
+      "independence", "forced", "commission", "-", "hvg"], ["euro", "sarkozy", "fiscal",
+      "merkel", "mr", "chancellor", "european", "german", "agenda", "soap"], ["mr",
+        "bank", "called", "central", "today", "prime", "government", "minister", "european",
+      "crisis"], ["mr", "fiscal", "mrs", "sarkozy", "merkel", "euro", "summit", "tax",
+      "leaders", "ecb"], ["called", "government", "financial", "policies", "part", "bank",
+      "central", "press", "mr", "president"], ["sarkozy", "merkel", "euro", "mr", "summit",
+      "mrs", "fiscal", "merkozy", "economic", "german"], ["mr", "prime", "minister",
+      "policies", "government", "financial", "crisis", "bank", "called", "part"], ["mr",
+        "bank", "government", "today", "called", "central", "minister", "prime", "issues",
+      "president"], ["mr", "orban", "central", "government", "parliament", "hungarian",
+      "minister", "hu", "personal", "bajnai"], ["government", "called", "central", "european",
+      "today", "bank", "prime", "financial", "part", "deficit"], ["mr", "orban", "government",
+      "hungarian", "bank", "hvg", "minister", "-", "fidesz", "hand"], ["mr", "bank", "european",
+      "minister", "policies", "crisis", "government", "president", "called", "shift"]]
+    end
+    context "when #topic_words is called on a chunked, segmented and tokenized collection" do
+      it "annotates the collection with the topic words and returns them" do
+        @@workers.extractors.topic_words.each do |extractor|
+          @collections.map(&method(:collection))
+          .map { |col| col.apply(:chunk,:segment,:tokenize) }
+          map { |col| col.topic_words }.should eql @topic_words
+        end
+      end
     end
   end
-end
-describe Treat::Workers::Formatters::Unserializers do
-  before do
-    @texts = ["A te"]
+  describe Treat::Workers::Formatters::Serializers do
+    before do
+      @texts = ["A test entity"]
+    end
+    context "when #serialize is called on any textual entity" do
+      it "serializes the entity to disk and returns a pointer to the location" do
+        # m = Treat::Entities::Entity.build
+        @texts.map(&:to_entity).map(&:serialize)
+        .map(&method(:entity)).map(&:to_s).should eql @texts
+      end
+    end
   end
-  context "when #unserialize is called with a selector on any textual entity" do
-    it "unserializes the file and loads it in the entity" do
+  describe Treat::Workers::Formatters::Unserializers do
+    before do
+      @texts = ["A te"]
+    end
+    context "when #unserialize is called with a selector on any textual entity" do
+      it "unserializes the file and loads it in the entity" do
+      end
     end
   end
 end
 =begin
 visualize: {
   entity: {

data/spec/workers/english.rb CHANGED Viewed

@@ -1,18 +1,11 @@
 require 'rspec'
 require_relative '../../lib/treat'
-include Treat::Core::DSL
-=begin
-Treat.libraries.stanford.model_path = '/ruby/stanford/stanford-core-nlp-all/'
-Treat.libraries.stanford.jar_path = '/ruby/stanford/stanford-core-nlp-all/'
-Treat.libraries.punkt.model_path = '/ruby/punkt/'
-Treat.libraries.reuters.model_path = '/ruby/reuters/'
-=end
+class Treat::Specs::Workers::English
-class English
+  @@workers = Treat.languages.english.workers
-  $workers = Treat.languages.english.workers
   Treat.core.language.default = 'english'
   Treat.core.language.detect  = false
@@ -29,7 +22,7 @@ class English
     context "when #segment is called on a zone" do
       it "segments the zone into groups" do
-        $workers.processors.segmenters.each do |segmenter|
+        @@workers.processors.segmenters.each do |segmenter|
           @zones.map { |zone| zone.segment(segmenter) }
           .map { |zone| zone.groups.map(&:to_s) }
           .should eql @groups
@@ -72,7 +65,7 @@ class English
     end
     context "when #tokenize is called on a group" do
       it "separates the group into tokens" do
-        $workers.processors.tokenizers.each do |tokenizer|
+        @@workers.processors.tokenizers.each do |tokenizer|
           @groups.dup.map { |text| group(text).tokenize(tokenizer) }
           .map { |group| group.tokens.map(&:to_s) }
           .should eql @tokens
@@ -88,7 +81,7 @@ class English
     end
     context "when #parse is called on a group" do
       it "tokenizes and parses the group into its syntactical phrases" do
-        $workers.processors.parsers.each do |parser|
+        @@workers.processors.parsers.each do |parser|
           @groups.dup.map { |text| group(text).parse(parser) }
           .map { |group| group.phrases.map(&:to_s)}
           .should eql @phrases
@@ -106,7 +99,7 @@ class English
     end
     context "when #tag is is called on a tokenized group" do
       it "annotates each token in the group with its tag and returns the tag 'G'" do
-        $workers.lexicalizers.taggers.each do |tagger|
+        @@workers.lexicalizers.taggers.each do |tagger|
           @groups.map { |txt| group(txt).tag(tagger) }
           .all? { |tag| tag == 'G' }.should be_true
           @groups.map { |txt| group(txt).tokenize }
@@ -117,7 +110,7 @@ class English
     end
     context "when #tag is called on a token" do
       it "annotates the token with its tag and returns it" do
-        $workers.lexicalizers.taggers.each do |tagger|
+        @@workers.lexicalizers.taggers.each do |tagger|
           @tokens.map { |tok| token(tok).tag(tagger)  }
           .should eql @token_tags
         end
@@ -186,7 +179,7 @@ class English
     context "when #synonym is called on a word, or #sense is "+
     "called on a word with option :nym set to 'hyponyms'" do
       it "returns the hyponyms of the word" do
-        $workers.lexicalizers.sensers.each do |senser|
+        @@workers.lexicalizers.sensers.each do |senser|
           @words.map { |txt| word(txt) }
           .map { |wrd| wrd.hyponyms(senser) }.should eql @hyponyms
           @words.map { |txt| word(txt) }
@@ -199,7 +192,7 @@ class English
     context "when #hypernyms is called on a word or #sense is "+
     "called on a word with option :nym set to 'hyponyms'" do
       it "returns the hyponyms of the word" do
-        $workers.lexicalizers.sensers.each do |senser|
+        @@workers.lexicalizers.sensers.each do |senser|
           @words.map { |txt| word(txt) }
           .map { |wrd| wrd.hypernyms(senser) }.should eql @hypernyms
           @words.map { |txt| word(txt) }
@@ -212,7 +205,7 @@ class English
     context "when #antonyms is called on a word or #sense is" +
     "called on a word with option :nym set to 'antonyms'" do
       it "returns the hyponyms of the word" do
-        $workers.lexicalizers.sensers.each do |senser|
+        @@workers.lexicalizers.sensers.each do |senser|
           @words.map { |txt| word(txt) }
           .map { |wrd| wrd.antonyms(senser) }.should eql @antonyms
           @words.map { |txt| word(txt) }
@@ -225,7 +218,7 @@ class English
     context "when #synonyms is called on a word or #sense is" +
     "called on a word with option :nym set to 'synonyms'" do
       it "returns the hyponyms of the word" do
-        $workers.lexicalizers.sensers.each do |senser|
+        @@workers.lexicalizers.sensers.each do |senser|
           @words.map { |txt| word(txt) }
           .map { |wrd| wrd.synonyms(senser) }.should eql @synonyms
           @words.map { |txt| word(txt) }
@@ -251,7 +244,7 @@ class English
     context "when #category is called on a tokenized and tagged group" do
       it "returns a tag corresponding to the group name" do
-        $workers.lexicalizers.categorizers.each do |categorizer|
+        @@workers.lexicalizers.categorizers.each do |categorizer|
           [phrase(@phrase), fragment(@fragment), sentence(@sentence)]
           .map { |grp| grp.apply(:tag).category(categorizer) }
           .should eql @group_categories
@@ -261,7 +254,7 @@ class English
     context "when #category is called called on a tagged token" do
       it "returns the category corresponding to the token's tag" do
-        $workers.lexicalizers.categorizers.each do |categorizer|
+        @@workers.lexicalizers.categorizers.each do |categorizer|
           @tokens.map { |tok| token(tok).apply(:tag).category(categorizer) }
           .should eql @token_tags
         end
@@ -281,7 +274,7 @@ class English
     context "when #ordinal is called on a number" do
       it "returns the ordinal form (e.g. 'first') of the number" do
-        $workers.inflectors.ordinalizers.each do |ordinalizer|
+        @@workers.inflectors.ordinalizers.each do |ordinalizer|
           @numbers.map { |num| number(num) }
           .map { |num| num.ordinal(ordinalizer) }.should eql @ordinal
         end
@@ -290,7 +283,7 @@ class English
     context "when #cardinal is called on a number" do
       it "returns the cardinal form (e.g. 'second' of the number)" do
-        $workers.inflectors.cardinalizers.each do |cardinalizer|
+        @@workers.inflectors.cardinalizers.each do |cardinalizer|
           @numbers.map { |num| number(num) }
           .map { |num| num.cardinal(cardinalizer) }.should eql @cardinal
         end
@@ -306,7 +299,7 @@ class English
     end
     context "when #stem is called on a word" do
       it "annotates the word with its stem and returns the stem" do
-        $workers.inflectors.stemmers.each do |stemmer|
+        @@workers.inflectors.stemmers.each do |stemmer|
           @words.map { |wrd| wrd.stem(stemmer) }.should eql @stems
         end
       end
@@ -321,7 +314,7 @@ class English
     context "when #name_tag called on a tokenized group" do
       it "tags each token with its name tag" do
-        $workers.extractors.name_tag.each do |tagger|
+        @@workers.extractors.name_tag.each do |tagger|
           @groups.map { |grp| grp.tokenize.apply(:name_tag) }
           .map { |grp| grp.tokens.map { |t| t.get(:name_tag) } }
           .should eql @tags
@@ -339,7 +332,7 @@ class English
     end
     context "when #topics is called on a chunked, segmented and tokenized document" do
       it "annotates the document with its general topics and returns them" do
-        $workers.extractors.topics.each do |extractor|
+        @@workers.extractors.topics.each do |extractor|
           @files.map { |f| document(f).apply(:chunk, :segment, :tokenize) }
           .map { |doc| doc.topics }.should eql @topics
         end
@@ -354,7 +347,7 @@ class English
     end
     context "when called on a tokenized group representing a time expression" do
       it "returns the DateTime object corresponding to the time" do
-        $workers.extractors.time.each do |extractor|
+        @@workers.extractors.time.each do |extractor|
           puts @expressions.map(&:time).inspect
           @expressions.map(&:time).all? { |time| time
           .is_a?(DateTime) }.should be_true
@@ -365,49 +358,6 @@ class English
     end
   end
-  describe Treat::Workers::Extractors::TopicWords do
-    before do
-      @collections = ["./spec/workers/examples/english/economist"]
-      @topic_words = [["euro", "zone", "european", "mrs", "greece", "chancellor",
-      "berlin", "practice", "german", "germans"], ["bank", "minister", "central",
-      "bajnai", "mr", "hu", "orban", "commission", "hungarian", "government"],
-      ["bank", "mr", "central", "bajnai", "prime", "government", "brussels",
-      "responsibility", "national", "independence"], ["mr", "bank", "central",
-      "policies", "prime", "minister", "today", "financial", "government", "funds"],
-      ["euro", "merkel", "mr", "zone", "european", "greece", "german", "berlin",
-      "sarkozy", "government"], ["mr", "bajnai", "today", "orban", "government",
-      "forced", "independence", "part", "hand", "minister"], ["sarkozy", "mrs",
-      "zone", "euro", "fiscal", "called", "greece", "merkel", "german", "financial"],
-      ["mr", "called", "central", "policies", "financial", "bank", "european",
-      "prime", "minister", "shift"], ["bajnai", "orban", "prime", "mr", "government",
-      "independence", "forced", "commission", "-", "hvg"], ["euro", "sarkozy", "fiscal",
-      "merkel", "mr", "chancellor", "european", "german", "agenda", "soap"], ["mr",
-        "bank", "called", "central", "today", "prime", "government", "minister", "european",
-      "crisis"], ["mr", "fiscal", "mrs", "sarkozy", "merkel", "euro", "summit", "tax",
-      "leaders", "ecb"], ["called", "government", "financial", "policies", "part", "bank",
-      "central", "press", "mr", "president"], ["sarkozy", "merkel", "euro", "mr", "summit",
-      "mrs", "fiscal", "merkozy", "economic", "german"], ["mr", "prime", "minister",
-      "policies", "government", "financial", "crisis", "bank", "called", "part"], ["mr",
-        "bank", "government", "today", "called", "central", "minister", "prime", "issues",
-      "president"], ["mr", "orban", "central", "government", "parliament", "hungarian",
-      "minister", "hu", "personal", "bajnai"], ["government", "called", "central", "european",
-      "today", "bank", "prime", "financial", "part", "deficit"], ["mr", "orban", "government",
-      "hungarian", "bank", "hvg", "minister", "-", "fidesz", "hand"], ["mr", "bank", "european",
-      "minister", "policies", "crisis", "government", "president", "called", "shift"]]
-    end
-    context "when #topic_words is called on a chunked, segmented and tokenized collection" do
-      it "annotates the collection with the topic words and returns them" do
-        $workers.extractors.topic_words.each do |extractor|
-          @collections.map(&method(:collection))
-          .map { |col| col.apply(:chunk,:segment,:tokenize) }
-          map { |col| col.topic_words }.should eql @topic_words
-        end
-      end
-    end
-  end
   describe Treat::Workers::Inflectors::Conjugators do
     before do
       @infinitives = ["run"]
@@ -417,7 +367,7 @@ class English
     context "when #present_participle is called on a word or #conjugate " +
     "is called on a word with option :form set to 'present_participle'" do
       it "returns the present participle form of the verb" do
-        $workers.inflectors.conjugators.each do |conjugator|
+        @@workers.inflectors.conjugators.each do |conjugator|
           @participles.map { |verb| verb
           .infinitive(conjugator) }
           .should eql @infinitives
@@ -431,7 +381,7 @@ class English
     context "when #infinitive is called on a word or #conjugate is " +
     "called on a word with option :form set to 'infinitive'" do
       it "returns the infinitive form of the verb" do
-        $workers.inflectors.conjugators.each do |conjugator|
+        @@workers.inflectors.conjugators.each do |conjugator|
           @infinitives.map { |verb| verb
           .present_participle(conjugator) }
           .should eql @participles
@@ -452,7 +402,7 @@ class English
     context "when #plural is called on a word, or #declense "+
     "is called on a word with option :count set to 'plural'" do
       it "returns the plural form of the word" do
-        $workers.inflectors.declensors.each do |declensor|
+        @@workers.inflectors.declensors.each do |declensor|
           @singulars.map { |word| word.plural(declensor) }
           .should eql @plurals
           @singulars.map { |word| word
@@ -464,7 +414,7 @@ class English
     context "when #singular is called on a word, or #declense " +
     "is called on a word with option :count set to 'singular'" do
       it "returns the singular form of the word" do
-        $workers.inflectors.declensors.each do |declensor|
+        @@workers.inflectors.declensors.each do |declensor|
           next if declensor == :linguistics
           @plurals.map { |word| word.singular(declensor) }
           .should eql @singulars

data/spec/workers/examples/english/economist/saving_the_euro.odt CHANGED Viewed

Binary file

metadata CHANGED Viewed

@@ -1,7 +1,7 @@
 --- !ruby/object:Gem::Specification
 name: treat
 version: !ruby/object:Gem::Version
-  version: 2.0.2
+  version: 2.0.3
   prerelease:
 platform: ruby
 authors:
@@ -9,7 +9,7 @@ authors:
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2012-12-15 00:00:00.000000000 Z
+date: 2013-01-04 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: schiphol
@@ -75,22 +75,6 @@ dependencies:
     - - ! '>='
       - !ruby/object:Gem::Version
         version: '0'
-- !ruby/object:Gem::Dependency
-  name: terminal-table
-  requirement: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
-  type: :development
-  prerelease: false
-  version_requirements: !ruby/object:Gem::Requirement
-    none: false
-    requirements:
-    - - ! '>='
-      - !ruby/object:Gem::Version
-        version: '0'
 - !ruby/object:Gem::Dependency
   name: simplecov
   requirement: !ruby/object:Gem::Requirement
@@ -116,6 +100,7 @@ extra_rdoc_files: []
 files:
 - bin/MANIFEST
 - lib/treat/autoload.rb
+- lib/treat/builder.rb
 - lib/treat/config/config.rb
 - lib/treat/config/configurable.rb
 - lib/treat/config/data/core.rb
@@ -244,6 +229,7 @@ files:
 - lib/treat/workers/processors/segmenters/srx.rb
 - lib/treat/workers/processors/segmenters/stanford.rb
 - lib/treat/workers/processors/segmenters/tactful.rb
+- lib/treat/workers/processors/tokenizers/maxent.rb
 - lib/treat/workers/processors/tokenizers/ptb.rb
 - lib/treat/workers/processors/tokenizers/punkt.rb
 - lib/treat/workers/processors/tokenizers/stanford.rb
@@ -278,7 +264,9 @@ files:
 - spec/workers/examples/english/test.txt
 - models/MANIFEST
 - tmp/MANIFEST
+- files/21552208.html
 - files/MANIFEST
+- files/nethttp-cheat-sheet-2940.html
 - README.md
 - LICENSE
 homepage: https://github.com/louismullie/treat