RubyGems - ngrams - Versions diffs - 0.1.0 - Mend

ngrams 0.1.0

Files changed (9) hide show

data/lib/ngrams.rb ADDED

	@@ -0,0 +1,2 @@
1	+ require 'ngrams/stdlib_ext'
2	+ require 'ngrams/ngrams'

data/lib/ngrams/ngrams.rb ADDED

@@ -0,0 +1,221 @@
+=begin rdoc
+Ngrams - Copyright (c) 2006 Matt Mower <self@mattmower.com>
+Released under the MIT license (see LICENSE file in the distribution)
+The Ngram library was written as a way of mucking about with bigram and trigram
+analysis of English words. It was inspired by {Tom Van Vleck's GPW program}[http://www.multicians.org/thvv/tvvtools.html], which uses trigrams to produce pronounceable passwords; an updated version, genpasswd, is included with the library. It addresses the one shortcoming of Van Vleck's program, which was that it didn't consider trigrams at the beginning of words
+separately to those occurring anywhere in a word.
+All the real work here is being done by the #Dictionary class, which includes methods for parsing ngrams from words and files, for indexing them for fast lookup, and for returning random selections based upon frequency of occurrence.
+Possible improvements to this library could include a means to bias the frequency analysis for example to make less common combinations occur more frequently (effectively inverting the probability of occurrence).
+Two command line utilities are supplied:
+ngramtool - builds stores from a dictionary file and extracts ngrams from a store.
+genpasswd - creates random pronounceable passwords using trigrams
+See --help for each tool
+The ngrams library comes with a store pre-built (using the standard MacOSX dictionary file) which should be sufficient for most purposes. To use a different dictionary, build a new store and pass the location of the store when initializing Dictionary.
+=end
+require 'yaml'
+require 'ngrams/stdlib_ext'
+module Ngram
+  # The Dictionary holds an indexed collection of bigrams (2-letter combinations) and
+  # trigrams (3-letter combinations) extracted from a dictionary of words.
+  #
+  # Example usage:
+  #   dict = Dictionary.load
+  #   word = dict.ngram( :first, 3 )
+  #   5.times { word << dict.next_char( word ) }
+  #   puts word
+  #
+  # of course a simpler way to achieve the same would be to use dict.word(8)
+  #
+  class Dictionary
+    attr_accessor :ngrams, :ridx, :walk
+    DEFAULT_STORE = File.join( File.dirname( __FILE__ ), '..', '..', 'data', 'ngrams.yml' )
+    # Return an Dictionary instance initialized using the YAML data in the specified file.
+    def self.load( file = DEFAULT_STORE )
+      File.open( file ) { |file| YAML::load( file ) }
+    end
+    # Initialize a new, empty, Dictionary.
+    #
+    # Use #add_from_file or #add_from_word to load new ngrams into the dictionary. Once
+    # all words have been loaded call #build_indices to ready the dictionary for use and
+    # #store to save it to disk.
+    def initialize
+      @ngrams = {
+        :first => {
+          2 => Hash.new( 0 ),
+          3 => Hash.new( 0 )
+        },
+        :any => {
+          2 => Hash.new( 0 ),
+          3 => Hash.new( 0 )
+        }
+      }
+      init_reverse_index
+      init_walk_tree
+    end
+    # Returns a randomly selected 2 or 3 character ngram string
+    #
+    # Specifying type :first will select only ngrams that appear at the beginning of words
+    # from the source dictonary. Type :any will select ngrams that appear anywhere in a word.
+    #
+    # length can be either 2 (bigram) or 3 (trigram)
+    #
+    # The Dictionary tracks the frequency of each ngram and the random selection is weighted
+    # such that the probability of any ngram being selected is proportional to its frequency
+    # in the source dictionary.
+    def ngram( type, length )
+      r = Integer( @sigma[type][length] * rand )
+      @ridx[type][length].detect { |sum,_| sum >= r }.last.dup
+    end
+    # Returns a randomly selected character to follow the input. Repeated calls to this method
+    # implement a random-walk through the ngrams in the dictionary given a specified starting point.
+    #
+    # Either supply a string parameter containing a word for completion or two
+    # single characters. The following calls are equivalent:
+    #
+    #   next_char( 'a', 'b' )
+    #   next_char( 'ab' )
+    #
+    # In both cases the call will return a randomly selected character to follow the specified
+    # characters. The Dictionary tracks the frequency of each ngram and the random selection
+    # is weighted such that the probability of any following character being selected is proportional
+    # to the frequency with which it follows the specified characters in the source dictionary.
+    def next_char( a, b = nil )
+      if b.nil?
+        a, b = a[-2,1], a[-1,1]
+      end
+      r = Integer( @walk[a][b].first * rand )
+      @walk[a][b].last.detect { |sum,c| sum >= r }.last.dup
+    end
+    # Returns a word created by selecting a starting ngram and then doing a random walk
+    # to add the remaining characters to the specified length.
+    def word( length )
+      s = ngram( :first, 3 )
+      ( length - 3 ).times { s << next_char( s ) }
+      s
+    end
+    # Store the Ngram dictionary and indices to a file using YAML
+    def save( file )
+      File.open( file, "w" ) do |file|
+        YAML::dump( self, file )
+      end
+    end
+    # Add ngrams to the current dictionary corresponding to the words found in
+    # the specified file. The file should contain one word per line and
+    # (ideally) only use alpha characters.
+    def add_from_file( file )
+      File.open( file, "r" ) do |file|
+        file.each { |line| add_from_word( line.chomp.downcase ) }
+      end
+    end
+    # Add ngrams to the current dictionary using the given word as a source.
+    def add_from_word( word )
+      2.upto( 3 ) do |n|
+        ngrams = word.ngrams( n )
+        unless ngrams.size == 0
+          inc( :first, n, ngrams.first )
+          ngrams.each { |ngram| inc( :any, n, ngram ) }
+        end
+      end
+    end
+    # Used to build the reverse index and trees that are used to by the
+    # random selection and walk code. If using a new dictionary (rather than
+    # a dictionary obtained via #load) call this before using #word, #ngram, or
+    # #next_char
+    def build_indices
+      build_reverse_index
+      build_walk_tree
+    end
+  private
+    def build_reverse_index
+      init_reverse_index
+      [ [:first,2], [:first,3], [:any,2], [:any,3] ].each do |type,n|
+        accumulator = 0
+        @ngrams[type][n].each do |ngram,score|
+            accumulator += score
+            @ridx[type][n] = @ridx[type][n] << [accumulator,ngram]
+        end
+        @sigma[type][n] = @ngrams[type][n].inject( 0 ) { |injection, element| injection+element.last }
+      end
+    end
+    def build_walk_tree
+      init_walk_tree
+      @ngrams[:any][3].each do |ngram,score|
+        a, b, c = ngram.unpack( "aaa" )
+        @walk[a][b][0] += score
+        @walk[a][b][1] << [@walk[a][b][0],c]
+      end
+    end
+    def init_reverse_index
+      @ridx = {
+        :first => {
+          2 => [],
+          3 => []
+        },
+        :any => {
+          2 => [],
+          3 => []
+        }
+      }
+      @sigma = {
+        :first => {
+          2 => 0,
+          3 => 0
+        },
+        :any => {
+          2 => 0,
+          3 => 0
+        }
+      }
+    end
+    def init_walk_tree
+      @walk = {}
+      ('a'..'z').each do |a|
+        @walk[a] = {}
+        ('a'..'z').each do |b|
+          @walk[a][b] = [0,[]]
+        end
+      end
+    end
+    def inc( type, n, ngram )
+      @ngrams[type][n][ngram] = @ngrams[type][n].has_key?( ngram ) ? @ngrams[type][n][ngram]+1 : 1
+    end
+  end
+end
+if __FILE__ == $0
+  if ARGV.length > 0
+    ngs = Ngram::Dictionary.new
+    ngs.parse_from_file( ARGV[0] )
+    ngs.store
+  end
+end

data/lib/ngrams/pwdgen.rb ADDED

@@ -0,0 +1,22 @@
+require 'ngrams'
+include Ngram
+module PwdGen
+  class PasswordGenerator
+    def initialize( file = Dictionary::DEFAULT_STORE )
+      @ngs = Dictionary.load( file )
+    end
+    def generate_n( n, length )
+      Array.new( n, nil ).map { |_| generate( length ) }
+    end
+    def generate( length )
+      @ngs.word( length )
+    end
+  end
+end

data/lib/ngrams/stdlib_ext.rb ADDED

@@ -0,0 +1,13 @@
+#
+# Extend standard library classes with methods required by Ngrams
+#
+if !String.respond_to? :ngrams
+  class String
+    # Return the result of splitting the string into an array of ngrams of length n.
+    def ngrams( n )
+      (0..self.length-n).to_a.collect { |idx| self[idx,n] }
+    end
+  end
+else
+  raise "Cannot patch in String#ngrams as it is already defined!"
+end

metadata ADDED

@@ -0,0 +1,54 @@
+--- !ruby/object:Gem::Specification
+rubygems_version: 0.9.0
+specification_version: !int:Fixnum 1
+name: ngrams
+version: !ruby/object:Gem::Version
+  version: 0.1.0
+date: 2006-07-23 00:00:00 +01:00
+summary: A library for manipulating bigrams and trigrams to generate pronouncable words.
+require_paths:
+- lib
+email: self@mattmower.com
+homepage: http://rubyforge.org/projects/ngrams/
+rubyforge_project: ngrams
+description:
+autorequire: ngrams
+default_executable:
+bindir: bin
+has_rdoc: true
+required_ruby_version: !ruby/object:Gem::Version::Requirement
+  requirements:
+  - - ">"
+    - !ruby/object:Gem::Version
+      version: 0.0.0
+  version:
+platform: ruby
+signing_key:
+cert_chain:
+post_install_message:
+authors:
+- Matt Mower
+files:
+- lib/ngrams.rb
+- lib/ngrams/ngrams.rb
+- lib/ngrams/pwdgen.rb
+- lib/ngrams/stdlib_ext.rb
+- data/ngrams.yml
+- bin/genpasswd
+- bin/ngramtool
+- LICENSE
+test_files: []
+rdoc_options: []
+extra_rdoc_files: []
+executables:
+- genpasswd
+- ngramtool
+extensions: []
+requirements: []
+dependencies: []