ngrams 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,2 @@
1
+ require 'ngrams/stdlib_ext'
2
+ require 'ngrams/ngrams'
@@ -0,0 +1,221 @@
1
+ =begin rdoc
2
+ Ngrams - Copyright (c) 2006 Matt Mower <self@mattmower.com>
3
+
4
+ Released under the MIT license (see LICENSE file in the distribution)
5
+
6
+ The Ngram library was written as a way of mucking about with bigram and trigram
7
+ analysis of English words. It was inspired by {Tom Van Vleck's GPW program}[http://www.multicians.org/thvv/tvvtools.html], which uses trigrams to produce pronounceable passwords; an updated version, genpasswd, is included with the library. It addresses the one shortcoming of Van Vleck's program, which was that it didn't consider trigrams at the beginning of words
8
+ separately to those occurring anywhere in a word.
9
+
10
+ All the real work here is done by the #Dictionary class, which includes methods for parsing ngrams from words and files, for indexing them for fast lookup, and for returning random selections based upon frequency of occurrence.
11
+
12
+ Possible improvements to this library could include a means to bias the frequency analysis for example to make less common combinations occur more frequently (effectively inverting the probability of occurrence).
13
+
14
+ Two command line utilities are supplied:
15
+
16
+ ngramtool - to build stores from a dictionary file and extract ngrams from a store.
17
+
18
+ genpasswd - to create random pronounceable passwords using trigrams
19
+
20
+ See --help for each tool
21
+
22
+ The ngrams library comes with a pre-built store (using the standard MacOSX dictionary file) which should be sufficient for most purposes. To use a different dictionary, build a new store and pass the location of the store when initializing Dictionary.
23
+ =end
24
require 'yaml'

require 'ngrams/stdlib_ext'
27
+
28
+ module Ngram
29
+
30
# The Dictionary holds an indexed collection of bigrams (2-letter combinations) and
# trigrams (3-letter combinations) extracted from a dictionary of words.
#
# Example usage:
#   dict = Dictionary.load
#   word = dict.ngram( :first, 3 )
#   5.times { word << dict.next_char( word ) }
#   puts word
#
# Of course a simpler way to achieve the same is to use dict.word(8).
#
class Dictionary
  attr_accessor :ngrams, :ridx, :walk

  DEFAULT_STORE = File.join(File.dirname(__FILE__), '..', '..', 'data', 'ngrams.yml')

  # Return a Dictionary instance initialized using the YAML data in the specified file.
  #
  # The store contains a serialized Ruby object, so it must be revived with the
  # "unsafe" loader: on Psych 4+ plain YAML.load is a safe load and refuses to
  # instantiate arbitrary classes. Older Psych versions have no unsafe_load, and
  # there YAML.load already revives objects.
  #
  # NOTE: reviving arbitrary objects from YAML is only safe for trusted files.
  # Never pass a store that comes from an untrusted source.
  def self.load(file = DEFAULT_STORE)
    File.open(file) do |io|
      YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(io) : YAML.load(io)
    end
  end

  # Initialize a new, empty, Dictionary.
  #
  # Use #add_from_file or #add_from_word to load new ngrams into the dictionary. Once
  # all words have been loaded call #build_indices to ready the dictionary for use and
  # #save to write it to disk.
  def initialize
    @ngrams = {
      :first => { 2 => Hash.new(0), 3 => Hash.new(0) },
      :any   => { 2 => Hash.new(0), 3 => Hash.new(0) }
    }

    init_reverse_index
    init_walk_tree
  end

  # Returns a randomly selected 2 or 3 character ngram string.
  #
  # Specifying type :first will select only ngrams that appear at the beginning of words
  # from the source dictionary. Type :any will select ngrams that appear anywhere in a word.
  #
  # length can be either 2 (bigram) or 3 (trigram).
  #
  # The selection is weighted so that the probability of any ngram being selected is
  # proportional to its frequency in the source dictionary.
  #
  # Raises RuntimeError when no ngrams of the requested kind have been indexed
  # (for example because #build_indices has not been called yet).
  def ngram(type, length)
    choice = weighted_pick(@sigma[type][length], @ridx[type][length])
    if choice.nil?
      raise "No #{type.inspect} ngrams of length #{length.inspect} are indexed; " \
            "add words and call build_indices first"
    end
    choice.dup
  end

  # Returns a randomly selected character to follow the input. Repeated calls to this method
  # implement a random-walk through the ngrams in the dictionary given a specified starting point.
  #
  # Either supply a string parameter containing a word for completion or two
  # single characters. The following calls are equivalent:
  #
  #   next_char( 'a', 'b' )
  #   next_char( 'ab' )
  #
  # The selection is weighted so that the probability of any following character is
  # proportional to the frequency with which it follows the specified characters in
  # the source dictionary.
  #
  # When the dictionary has no recorded follower for the pair (a dead end in the
  # walk, or characters it has never seen) the walk is restarted: the first
  # character of a frequency-weighted random :any bigram is returned instead.
  def next_char(a, b = nil)
    a, b = a[-2, 1], a[-1, 1] if b.nil?

    total, followers = (@walk[a] || {})[b]
    choice = weighted_pick(total, followers)
    choice = ngram(:any, 2)[0, 1] if choice.nil?
    choice.dup
  end

  # Returns a word created by selecting a starting trigram and then doing a random walk
  # to add the remaining characters up to the specified length.
  #
  # The starting trigram is never truncated, so the result always has at least
  # three characters even when length is smaller than 3.
  def word(length)
    s = ngram(:first, 3)
    (length - 3).times { s << next_char(s) }
    s
  end

  # Store the Ngram dictionary and indices to a file using YAML.
  def save(file)
    File.open(file, "w") { |io| YAML.dump(self, io) }
  end

  # Add ngrams to the current dictionary corresponding to the words found in
  # the specified file. The file should contain one word per line and
  # (ideally) only use alpha characters.
  #
  # Each line is stripped of surrounding whitespace (so CRLF files do not leak
  # "\r" into the ngrams) and downcased before being added.
  def add_from_file(file)
    File.open(file, "r") do |io|
      io.each_line { |line| add_from_word(line.strip.downcase) }
    end
  end

  # Add ngrams to the current dictionary using the given word as a source.
  # Words shorter than an ngram length contribute nothing for that length.
  def add_from_word(word)
    2.upto(3) do |n|
      grams = word.ngrams(n)
      next if grams.empty?

      inc(:first, n, grams.first)
      grams.each { |gram| inc(:any, n, gram) }
    end
  end

  # Used to build the reverse index and trees that are used by the
  # random selection and walk code. If using a new dictionary (rather than
  # a dictionary obtained via #load) call this before using #word, #ngram, or
  # #next_char.
  def build_indices
    build_reverse_index
    build_walk_tree
  end

  private

  # Pick one entry from a cumulative-weight list.
  #
  # entries is an array of [cumulative_weight, value] pairs in ascending
  # cumulative order and total is the final cumulative weight. Returns the
  # chosen value, or nil when there is nothing to choose from.
  def weighted_pick(total, entries)
    return nil if entries.nil? || entries.empty?

    r = Integer(total * rand) # uniform over 0...total
    # Strictly greater: an entry with weight w must own exactly w of the
    # possible r values. Using >= would over-weight the first entry and
    # starve the last one.
    hit = entries.detect { |sum, _| sum > r } || entries.last
    hit.last
  end

  # Rebuild @ridx (cumulative-frequency lists) and @sigma (total frequency)
  # from the raw counts in @ngrams.
  def build_reverse_index
    init_reverse_index
    [[:first, 2], [:first, 3], [:any, 2], [:any, 3]].each do |type, n|
      running = 0
      @ngrams[type][n].each do |gram, score|
        running += score
        @ridx[type][n] << [running, gram]
      end
      @sigma[type][n] = running
    end
  end

  # Rebuild @walk from the :any trigrams: for each leading pair of characters
  # it records the total frequency and a cumulative-frequency list of the
  # characters that follow the pair.
  def build_walk_tree
    init_walk_tree
    @ngrams[:any][3].each do |gram, score|
      # Split by character (not byte) to match the slicing done in #next_char.
      a, b, c = gram.chars
      node = walk_node(a, b)
      node[0] += score
      node[1] << [node[0], c]
    end
  end

  # Return the walk node for the pair (a, b), creating it on demand so that
  # characters outside a-z do not break index building.
  def walk_node(a, b)
    (@walk[a] ||= {})[b] ||= [0, []]
  end

  # Reset the reverse index and the frequency totals to their empty state.
  def init_reverse_index
    @ridx = {
      :first => { 2 => [], 3 => [] },
      :any   => { 2 => [], 3 => [] }
    }
    @sigma = {
      :first => { 2 => 0, 3 => 0 },
      :any   => { 2 => 0, 3 => 0 }
    }
  end

  # Reset the walk tree, pre-populating an empty node for every a-z pair.
  def init_walk_tree
    @walk = {}
    ('a'..'z').each do |a|
      @walk[a] = {}
      ('a'..'z').each { |b| @walk[a][b] = [0, []] }
    end
  end

  # Increment the count for ngram. fetch is used rather than the hash default
  # so this also works on count hashes revived from YAML, which do not keep
  # their default value.
  def inc(type, n, ngram)
    counts = @ngrams[type][n]
    counts[ngram] = counts.fetch(ngram, 0) + 1
  end
end
212
+
213
+ end
214
+
215
# When run directly, build a store from a word list:
#
#   ruby ngrams.rb WORDLIST [STORE]
#
# WORDLIST is a file with one word per line. The resulting dictionary, with its
# indices built, is written to STORE, or to Dictionary::DEFAULT_STORE when no
# second argument is given. Nothing happens when no arguments are supplied.
#
# Dictionary has no parse_from_file or store methods; the calls below use the
# methods it actually defines (add_from_file, build_indices and save).
if __FILE__ == $0 && !ARGV.empty?
  ngs = Ngram::Dictionary.new
  ngs.add_from_file(ARGV[0])
  ngs.build_indices
  ngs.save(ARGV[1] || Ngram::Dictionary::DEFAULT_STORE)
end
@@ -0,0 +1,22 @@
1
+ require 'ngrams'
2
+ include Ngram
3
+
4
module PwdGen

  # Generates pronounceable passwords by delegating to an ngram Dictionary
  # loaded from a YAML store.
  class PasswordGenerator

    # file is the path of the dictionary store to load; it defaults to
    # Dictionary::DEFAULT_STORE.
    def initialize(file = Dictionary::DEFAULT_STORE)
      @ngs = Dictionary.load(file)
    end

    # Returns an array of n passwords, each produced by #generate with the
    # given length.
    def generate_n(n, length)
      Array.new(n) { generate(length) }
    end

    # Returns a single password: the result of Dictionary#word for the given
    # length.
    def generate(length)
      @ngs.word(length)
    end

  end

end
@@ -0,0 +1,13 @@
1
+ #
2
+ # Extend standard library classes with methods required by Ngrams
3
+ #
4
# Refuse to overwrite an existing String#ngrams.
#
# The check must be method_defined? (instance methods of String).
# String.respond_to?(:ngrams) asks whether the String *class object* has a
# class-level method called ngrams, which is never the case, so a guard
# written that way can never fire.
if String.method_defined?(:ngrams)
  raise "Cannot patch in String#ngrams as it is already defined!"
end

class String
  # Return the result of splitting the string into an array of ngrams of length n.
  #
  #   "hello".ngrams( 2 )  # => ["he", "el", "ll", "lo"]
  #
  # Returns an empty array when the string is shorter than n or when n is
  # less than 1.
  def ngrams(n)
    return [] if n < 1

    (0..length - n).map { |idx| self[idx, n] }
  end
end
metadata ADDED
@@ -0,0 +1,54 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: !int:Fixnum 1
4
+ name: ngrams
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-07-23 00:00:00 +01:00
8
+ summary: A library for manipulating bigrams and trigrams to generate pronouncable words.
9
+ require_paths:
10
+ - lib
11
+ email: self@mattmower.com
12
+ homepage: http://rubyforge.org/projects/ngrams/
13
+ rubyforge_project: ngrams
14
+ description:
15
+ autorequire: ngrams
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Matt Mower
31
+ files:
32
+ - lib/ngrams.rb
33
+ - lib/ngrams/ngrams.rb
34
+ - lib/ngrams/pwdgen.rb
35
+ - lib/ngrams/stdlib_ext.rb
36
+ - data/ngrams.yml
37
+ - bin/genpasswd
38
+ - bin/ngramtool
39
+ - LICENSE
40
+ test_files: []
41
+
42
+ rdoc_options: []
43
+
44
+ extra_rdoc_files: []
45
+
46
+ executables:
47
+ - genpasswd
48
+ - ngramtool
49
+ extensions: []
50
+
51
+ requirements: []
52
+
53
+ dependencies: []
54
+