RubyGems - rwordnet2 - Versions diffs - 2.0.1 - Mend

rwordnet2 2.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (37) hide show

checksums.yaml +7 -0
data/History.txt +21 -0
data/README.markdown +76 -0
data/WordNet-3.0/AUTHORS +6 -0
data/WordNet-3.0/COPYING +31 -0
data/WordNet-3.0/LICENSE +31 -0
data/WordNet-3.0/README +101 -0
data/WordNet-3.0/dict/data.adj +18185 -0
data/WordNet-3.0/dict/data.adv +3650 -0
data/WordNet-3.0/dict/data.noun +82144 -0
data/WordNet-3.0/dict/data.verb +13796 -0
data/WordNet-3.0/dict/index.adj +21508 -0
data/WordNet-3.0/dict/index.adv +4510 -0
data/WordNet-3.0/dict/index.noun +117827 -0
data/WordNet-3.0/dict/index.verb +11558 -0
data/examples/benchmark.rb +14 -0
data/examples/dictionary.rb +20 -0
data/examples/full_hypernym.rb +9 -0
data/examples/morphy.rb +20 -0
data/examples/synset_find.rb +8 -0
data/lib/rwordnet/db.rb +25 -0
data/lib/rwordnet/lemma.rb +87 -0
data/lib/rwordnet/pointer.rb +32 -0
data/lib/rwordnet/pointers.rb +82 -0
data/lib/rwordnet/synset.rb +286 -0
data/lib/rwordnet/version.rb +3 -0
data/lib/rwordnet.rb +5 -0
data/morphy/exceptions/adj.exc +1490 -0
data/morphy/exceptions/adv.exc +7 -0
data/morphy/exceptions/noun.exc +2054 -0
data/morphy/exceptions/verb.exc +2401 -0
data/test/test_helper.rb +35 -0
data/test/unit/db_test.rb +14 -0
data/test/unit/lemma_test.rb +94 -0
data/test/unit/pointer_test.rb +26 -0
data/test/unit/synset_test.rb +83 -0
metadata +79 -0

data/examples/benchmark.rb ADDED Viewed

@@ -0,0 +1,14 @@
+require 'benchmark'
+require 'rwordnet'
+initial = Benchmark.realtime do
+  WordNet::Lemma.find(ARGV[0] || raise("Usage: ruby benchmark.rb noun"), :noun)
+end
+puts "Time to initial word #{initial}"
+lookup = Benchmark.realtime do
+  1000.times { WordNet::Lemma.find('fruit', :noun) }
+end
+puts "Time for 1k lookups #{lookup}"

data/examples/dictionary.rb ADDED Viewed

@@ -0,0 +1,20 @@
+# Use WordNet as a command-line dictionary.
+require 'rwordnet'
+if ARGV.size != 1
+  puts "Usage: ruby dictionary.rb word"
+  exit(1)
+end
+word = ARGV[0]
+# Find all the lemmas for a word (i.e., whether it occurs as a noun, verb, etc.)
+lemmas = WordNet::Lemma.find_all(word)
+# Print out each lemma with a list of possible meanings.
+lemmas.each do |lemma|
+  puts lemma
+  lemma.synsets.each_with_index do |synset,i|
+    puts "\t#{i+1}) #{synset.gloss}"
+  end
+end

data/examples/full_hypernym.rb ADDED Viewed

@@ -0,0 +1,9 @@
+require 'rwordnet'
+# Look up the noun 'dog' and keep only its first sense.
+first_sense = WordNet::Lemma.find("dog", :noun).synsets.first
+puts first_sense
+# Print every synset in the expanded hypernym list of that first sense.
+first_sense.expanded_hypernyms.each { |ancestor| puts ancestor }

data/examples/morphy.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require 'rwordnet'
+puts 'dogs'
+puts '--------------'
+puts 'as noun'
+p WordNet::Synset.morphy('dogs', 'noun')
+puts 'as verb'
+p WordNet::Synset.morphy('dogs', 'verb')
+puts ''
+puts 'hiking'
+puts '--------------'
+puts 'as noun'
+p WordNet::Synset.morphy('hiking', 'noun')
+puts 'as verb'
+p WordNet::Synset.morphy('hiking', 'verb')
+puts 'as all'
+p WordNet::Synset.morphy_all('hiking')

data/examples/synset_find.rb ADDED Viewed

@@ -0,0 +1,8 @@
+require 'rwordnet'
+puts 'hiking'
+WordNet::Synset.find_all('hiking').each{|d| puts d}
+puts''
+puts 'dogs'
+WordNet::Synset.find_all('dogs').each{|d| puts d}

data/lib/rwordnet/db.rb ADDED Viewed

@@ -0,0 +1,25 @@
+module WordNet
+  # Represents the WordNet database, and provides some basic interaction.
+  class DB
+    # By default, use the bundled WordNet
+    @path = File.expand_path("../../../WordNet-3.0/", __FILE__)
+    class << self; attr_accessor :cached end
+    @raw_wordnet = {}
+    class << self
+      # To use your own WordNet installation (rather than the one bundled with rwordnet:
+      # Returns the path to the WordNet installation currently in use. Defaults to the bundled version of WordNet.
+      attr_accessor :path
+      # Open a wordnet database. You shouldn't have to call this directly; it's
+      # handled by the autocaching implemented in lemma.rb.
+      #
+      # `path` should be a string containing the absolute path to the root of a
+      # WordNet installation.
+      def open(path, &block)
+        File.open(File.join(self.path, path), "r", &block)
+      end
+    end
+  end
+end

data/lib/rwordnet/lemma.rb ADDED Viewed

@@ -0,0 +1,87 @@
+module WordNet
+  # Represents a single word in the WordNet lexicon, which can be used to look up a set of synsets.
+  class Lemma
+    SPACE = ' '
+    POS_SHORTHAND = {:v => :verb, :n => :noun, :a => :adj, :r => :adv}
+    # The word this lemma represents
+    attr_accessor :word
+    # The part of speech (noun, verb, adjective) of this lemma. One of 'n', 'v', 'a' (adjective), or 'r' (adverb)
+    attr_accessor :pos
+    # The number of times the sense is tagged in various semantic concordance texts. A tagsense_count of 0 indicates that the sense has not been semantically tagged.
+    attr_accessor :tagsense_count
+    # The offset, in bytes, at which the synsets contained in this lemma are stored in WordNet's internal database.
+    attr_accessor :synset_offsets
+    # A unique integer id that references this lemma. Used internally within WordNet's database.
+    attr_accessor :id
+    # An array of valid pointer symbols for this lemma. The list of all valid
+    # pointer symbols is defined in pointers.rb.
+    attr_accessor :pointer_symbols
+    # Create a lemma from a line in an lexicon file. You should not be creating Lemmas by hand; instead,
+    # use the WordNet::Lemma.find and WordNet::Lemma.find_all methods to find the Lemma for a word.
+    def initialize(lexicon_line, id)
+      @id = id
+      line = lexicon_line.split(" ")
+      @word = line.shift
+      @pos = line.shift
+      synset_count = line.shift.to_i
+      @pointer_symbols = line.slice!(0, line.shift.to_i)
+      line.shift # Throw away redundant sense_cnt
+      @tagsense_count = line.shift.to_i
+      @synset_offsets = line.slice!(0, synset_count).map(&:to_i)
+    end
+    # Return a list of synsets for this Lemma. Each synset represents a different sense, or meaning, of the word.
+    def synsets
+      @synset_offsets.map { |offset| Synset.new(@pos, offset) }
+    end
+    # Returns a compact string representation of this lemma, e.g. "fall, v" for
+    # the verb form of the word "fall".
+    def to_s
+      [@word, @pos].join(",")
+    end
+    class << self
+      @@cache = {}
+      # Find all lemmas for this word across all known parts of speech
+      def find_all(word)
+        [:noun, :verb, :adj, :adv].flat_map do |pos|
+          find(word, pos) || []
+        end
+      end
+      # Find a lemma for a given word and pos. Valid parts of speech are:
+      # 'adj', 'adv', 'noun', 'verb'. Additionally, you can use the shorthand
+      # forms of each of these ('a', 'r', 'n', 'v')/
+      def find(word, pos)
+        # Map shorthand POS to full forms
+        pos = POS_SHORTHAND[pos] || pos
+        cache = @@cache[pos] ||= build_cache(pos)
+        if found = cache[word]
+          Lemma.new(*found)
+        end
+      end
+      private
+      def build_cache(pos)
+        cache = {}
+        DB.open(File.join("dict", "index.#{pos}")).each_line.each_with_index do |line, index|
+          word = line.slice(0, line.index(SPACE))
+          cache[word] = [line, index+1]
+        end
+        cache
+      end
+    end
+  end
+end

data/lib/rwordnet/pointer.rb ADDED Viewed

@@ -0,0 +1,32 @@
+module WordNet
+  # Pointers represent the relations between the words in one synset and another.
+  class Pointer
+    # The symbol that devices the relationship this pointer represents, e.g. "!" for verb antonym. Valid
+    # pointer symbols are defined in pointers.rb
+    attr_reader :symbol
+    # The offset, in bytes, of this pointer in WordNet's internal database.
+    attr_reader :offset
+    # The part of speech this pointer represents. One of 'n', 'v', 'a' (adjective), or 'r' (adverb).
+    attr_reader :pos
+    # The synset from which this pointer...points.
+    attr_reader :source
+    # The synset to which this pointer...points.
+    attr_reader :target
+    # Create a pointer. Pointers represent the relations between the words in one synset and another,
+    # and are referenced by a shorthand symbol (e.g. '!' for verb antonym). The list
+    # of valid pointer symbols is defined in pointers.rb
+    def initialize(symbol: raise, offset: raise, pos: raise, source: raise)
+      @symbol, @offset, @pos, @source = symbol, offset, pos, source
+      @target = source.slice!(2,2)
+    end
+    def is_semantic?
+      source == "00" && target == "00"
+    end
+  end
+end

data/lib/rwordnet/pointers.rb ADDED Viewed

@@ -0,0 +1,82 @@
+# A container for various constants.
+# In particular, contains constants representing the WordNet symbols used to look up synsets by relation, i.e. Hypernym/Hyponym.
+# Use these symbols in conjunction with the Synset#relation method.
+module WordNet
+  NOUN_POINTERS = {
+    "-c" => "Member of this domain - TOPIC",
+    "+" => "Derivationally related form",
+    "%p" => "Part meronym",
+    "~i" => "Instance Hyponym",
+    "@" => "Hypernym",
+    ";r" => "Domain of synset - REGION",
+    "!" => "Antonym",
+    "#p" => "Part holonym",
+    "%s" => "Substance meronym",
+    ";u" => "Domain of synset - USAGE",
+    "-r" => "Member of this domain - REGION",
+    "#s" => "Substance holonym",
+    "=" => "Attribute",
+    "-u" => "Member of this domain - USAGE",
+    ";c" => "Domain of synset - TOPIC",
+    "%m" => "Member meronym",
+    "~" => "Hyponym",
+    "@i" => "Instance Hypernym",
+    "#m" => "Member holonym"
+  }
+  VERB_POINTERS = {
+    "+" => "Derivationally related form",
+    "@" => "Hypernym",
+    ";r" => "Domain of synset - REGION",
+    "!" => "Antonym",
+    ";u" => "Domain of synset - USAGE",
+    "$" => "Verb Group",
+    ";c" => "Domain of synset - TOPIC",
+    ">" => "Cause",
+    "~" => "Hyponym",
+    "*" => "Entailment"
+  }
+  ADJECTIVE_POINTERS = {
+    ";r" => "Domain of synset - REGION",
+    "!" => "Antonym",
+    "\\" => "Pertainym (pertains to noun)",
+    "<" => "Participle of verb",
+    "&" => "Similar to",
+    "=" => "Attribute",
+    ";c" => "Domain of synset - TOPIC"
+  }
+  ADVERB_POINTERS = {
+    ";r" => "Domain of synset - REGION",
+    "!" => "Antonym",
+    ";u" => "Domain of synset - USAGE",
+    "\\" => "Derived from adjective",
+    ";c" => "Domain of synset - TOPIC"
+  }
+  MEMBER_OF_THIS_DOMAIN_TOPIC = "-c"
+  DERIVATIONALLY_RELATED_FORM = "+"
+  PART_MERONYM = "%p"
+  InstanceHyponym = "~i"
+  HYPERNYM = "@"
+  DOMAIN_OF_SYNSET_REGION = ";r"
+  ANTONYM = "!"
+  PART_HOLONYM = "#p"
+  SUBSTANCE_MERONYM = "%s"
+  VERB_GROUP = "$"
+  DOMAIN_OF_SYNSET_USAGE = ";u"
+  MEMBER_OF_THIS_DOMAIN_REGION = "-r"
+  SUBSTANCE_HOLONYM = "#s"
+  DERIVED_FROM_ADJECTIVE = "\\"
+  PARTICIPLE_OF_VERB = "<"
+  SIMILAR_TO = "&"
+  ATTRIBUTE = "="
+  ALSO_SEE = "^"
+  CAUSE = ">"
+  MEMBER_OF_THIS_DOMAIN_USAGE = "-u"
+  DOMAIN_OF_SYNSET_TOPIC = ";c"
+  MEMBER_MERONYM = "%m"
+  HYPONYM = "~"
+  INSTANCE_HYPERNYM = "@i"
+  ENTAILMENT = "*"
+  MEMBER_HOLONYM = "#m"
+end

data/lib/rwordnet/synset.rb ADDED Viewed

@@ -0,0 +1,286 @@
+module WordNet
+  SYNSET_TYPES = {"n" => "noun", "v" => "verb", "a" => "adj", "r" => "adv"}
+  MORPHOLOGICAL_SUBSTITUTIONS = {
+      'noun' => [['s', ''], ['ses', 's'], ['ves', 'f'], ['xes', 'x'],
+             ['zes', 'z'], ['ches', 'ch'], ['shes', 'sh'],
+             ['men', 'man'], ['ies', 'y']],
+      'verb' => [['s', ''], ['ies', 'y'], ['es', 'e'], ['es', ''],
+             ['ed', 'e'], ['ed', ''], ['ing', 'e'], ['ing', '']],
+      'adj' => [['er', ''], ['est', ''], ['er', 'e'], ['est', 'e']],
+      'adv' => []}
+  # Represents a synset (or group of synonymous words) in WordNet. Synsets are related to each other by various (and numerous!)
+  # relationships, including Hypernym (x is a hypernym of y <=> x is a parent of y) and Hyponym (x is a child of y)
+  class Synset
+    @morphy_path = File.expand_path("../../../morphy/", __FILE__)
+    @exception_map = {}
+    # Get the offset, in bytes, at which this synset's information is stored in WordNet's internal DB.
+    # You almost certainly don't care about this.
+    attr_reader :synset_offset
+    # A two digit decimal integer representing the name of the lexicographer file containing the synset for the sense.
+    # Probably only of interest if you're using a wordnet database marked up with custom attributes, and you
+    # want to ensure that you're using your own additions.
+    attr_reader :lex_filenum
+    # Get the list of words (and their frequencies within the WordNet graph) contained
+    # in this Synset.
+    attr_reader :word_counts
+    # Get the part of speech type of this synset. One of 'n' (noun), 'v' (verb), 'a' (adjective), or 'r' (adverb)
+    attr_reader :synset_type
+    # Get the offset, in bytes, at which this synset's POS information is stored in WordNet's internal DB.
+    # You almost certainly don't care about this.
+    attr_reader :pos_offset
+    # Get a shorthand representation of the part of speech this synset represents, e.g. "v" for verbs.
+    attr_reader :pos
+    # Get a string representation of this synset's gloss. "Gloss" is a human-readable
+    # description of this concept, often with example usage, e.g:
+    #
+    #    move upward; "The fog lifted"; "The smoke arose from the forest fire"; "The mist uprose from the meadows"
+    #
+    # for the second sense of the verb "fall"
+    attr_reader :gloss
+    # Create a new synset by reading from the data file specified by +pos+, at +offset+ bytes into the file. This is how
+    # the WordNet database is organized. You shouldn't be creating Synsets directly; instead, use Lemma#synsets.
+    def initialize(pos, offset)
+      data_line = DB.open(File.join("dict", "data.#{SYNSET_TYPES.fetch(pos)}")) do |f|
+        f.seek(offset)
+        f.readline.strip
+      end
+      info_line, @gloss = data_line.split(" | ", 2)
+      line = info_line.split(" ")
+      @pos = pos
+      @pos_offset = offset
+      @synset_offset = line.shift
+      @lex_filenum = line.shift
+      @synset_type = line.shift
+      @word_counts = {}
+      word_count = line.shift.to_i
+      word_count.times do
+        @word_counts[line.shift] = line.shift.to_i
+      end
+      pointer_count = line.shift.to_i
+      @pointers = Array.new(pointer_count).map do
+        Pointer.new(
+          symbol: line.shift[0],
+          offset: line.shift.to_i,
+          pos: line.shift,
+          source: line.shift
+        )
+      end
+    end
+    # Ported from python NLTK
+    # Load all synsets with a given lemma and part of speech tag.
+    # If no pos is specified, all synsets for all parts of speech
+    # will be loaded.
+    # If lang is specified, all the synsets associated with the lemma name
+    # of that language will be returned.
+    def self.find(word, pos)
+        word = word.downcase
+        lemmas = self.morphy(word, pos).map{|form| WordNet::Lemma.find(form, pos)}
+        lemmas.map{|lemma| lemma.synsets}.flatten
+    end
+    def self.find_all(word)
+        SYNSET_TYPES.values.map{|pos| self.find(word, pos)}.flatten
+    end
+    def self.load_exception_map
+        SYNSET_TYPES.each do |_, pos|
+            @exception_map[pos] = {}
+            File.open(File.join(@morphy_path, 'exceptions', "#{pos}.exc"), 'r').each_line do |line|
+                line = line.split
+                @exception_map[pos][line[0]] = line[1..-1]
+            end
+        end
+    end
+    def self._apply_rules(forms, pos)
+        substitutions = MORPHOLOGICAL_SUBSTITUTIONS[pos]
+        out = []
+        forms.each do |form|
+            substitutions.each do |old, new|
+                if form.end_with? old
+                    out.push form[0...-old.length] + new
+                end
+            end
+        end
+        return out
+    end
+    def self._filter_forms(forms, pos)
+        forms.reject{|form| Lemma.find(form, pos).nil?}.uniq
+    end
+    # ported from nltk python
+    # from jordanbg:
+    # Given an original string x
+    # 1. Apply rules once to the input to get y1, y2, y3, etc.
+    # 2. Return all that are in the database
+    # 3. If there are no matches, keep applying rules until you either
+    #    find a match or you can't go any further
+    def self.morphy(form, pos)
+        if @exception_map == {}
+            self.load_exception_map
+        end
+        exceptions = @exception_map[pos]
+        # 0. Check the exception lists
+        if exceptions.has_key? form
+            return self._filter_forms([form] + exceptions[form], pos)
+        end
+        # 1. Apply rules once to the input to get y1, y2, y3, etc.
+        forms = self._apply_rules([form], pos)
+        # 2. Return all that are in the database (and check the original too)
+        results = self._filter_forms([form] + forms, pos)
+        if results != []
+            return results
+        end
+        # 3. If there are no matches, keep applying rules until we find a match
+        while forms.length > 0
+            forms = self._apply_rules(forms, pos)
+            results = self._filter_forms(forms, pos)
+            if results != []
+                return results
+            end
+        end
+        # Return an empty list if we can't find anything
+        return []
+    end
+    def self.morphy_all(form)
+        SYNSET_TYPES.values.map{|pos| self.morphy(form, pos)}.flatten
+    end
+    # How many words does this Synset include?
+    def word_count
+      @word_counts.size
+    end
+    # Get a list of words included in this Synset
+    def words
+      @word_counts.keys
+    end
+    # Get an array of Synsets with the relation `pointer_symbol` relative to this
+    # Synset. Mostly, this is an internal method used by convience methods (e.g. Synset#antonym), but
+    # it can take any valid valid +pointer_symbol+ defined in pointers.rb.
+    #
+    # Example (get the gloss of an antonym for 'fall'):
+    #     WordNet::Lemma.find("fall", :verb).synsets[1].relation("!")[0].gloss
+    def relation(pointer_symbol)
+      @pointers.select { |pointer| pointer.symbol == pointer_symbol }.
+        map! { |pointer| Synset.new(pointer.pos, pointer.offset) }
+    end
+    # Get the Synsets of this sense's antonym
+    def antonyms
+      relation(ANTONYM)
+    end
+    # Get the parent synset (higher-level category, i.e. fruit -> reproductive_structure).
+    def hypernym
+      relation(HYPERNYM)[0]
+    end
+    # Get the parent synset (higher-level category, i.e. fruit -> reproductive_structure)
+    # as an array.
+    def hypernyms
+      relation(HYPERNYM)
+    end
+    # Get the child synset(s) (i.e., lower-level categories, i.e. fruit -> edible_fruit)
+    def hyponyms
+      relation(HYPONYM)
+    end
+    # Get the entire hyponym tree as an array
+    def expanded_hyponyms
+      children = self.hyponyms
+      return [] if children.empty?
+      return [children, children.collect{|child| child.expanded_hyponyms}.flatten].flatten
+    end
+    # Get the entire hypernym tree (from this synset all the way up to +entity+) as an array.
+    def expanded_first_hypernyms
+      parent = hypernym
+      list = []
+      return list unless parent
+      while parent
+        break if list.include? parent.pos_offset
+        list.push parent.pos_offset
+        parent = parent.hypernym
+      end
+      list.flatten!
+      list.map! { |offset| Synset.new(@pos, offset)}
+    end
+    # Get the entire hypernym tree (from this synset all the way up to +entity+) as an array.
+    def expanded_hypernyms
+      parents = hypernyms
+      list = []
+      return list unless parents
+      while parents.length > 0
+        parent = parents.pop
+        next if list.include? parent.pos_offset
+        list.push parent.pos_offset
+        parents.push *parent.hypernyms
+      end
+      list.flatten!
+      list.map! { |offset| Synset.new(@pos, offset)}
+    end
+    def expanded_hypernyms_depth
+      parents = hypernyms.map{|hypernym| [hypernym, 1]}
+      list = []
+      out = []
+      return list unless parents
+      max_depth = 1
+      while parents.length > 0
+        parent, depth = parents.pop
+        next if list.include? parent.pos_offset
+        list.push parent.pos_offset
+        out.push [Synset.new(@pos, parent.pos_offset), depth]
+        parents.push *(parent.hypernyms.map{|hypernym| [hypernym, depth + 1]})
+        max_depth = [max_depth, depth].max
+      end
+      return [out, max_depth]
+    end
+    # Returns a compact, human-readable form of this synset, e.g.
+    #
+    #    (v) fall (descend in free fall under the influence of gravity; "The branch fell from the tree"; "The unfortunate hiker fell into a crevasse")
+    #
+    # for the second meaning of the verb "fall."
+    def to_s
+      "(#{@synset_type}) #{words.map { |x| x.tr('_',' ') }.join(', ')} (#{@gloss})"
+    end
+    alias to_str to_s
+    alias size word_count
+    alias parent hypernym
+    alias parents hypernyms
+    alias children hyponyms
+  end
+end

data/lib/rwordnet/version.rb ADDED Viewed

@@ -0,0 +1,3 @@
+module WordNet
+  # The version string of this library.
+  VERSION = "2.0.1"
+end

data/lib/rwordnet.rb ADDED Viewed

@@ -0,0 +1,5 @@
+require 'rwordnet/pointer'
+require 'rwordnet/db'
+require 'rwordnet/lemma'
+require 'rwordnet/pointers'
+require 'rwordnet/synset'