ngrams 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,2 @@
1
+ require 'ngrams/stdlib_ext'
2
+ require 'ngrams/ngrams'
@@ -0,0 +1,221 @@
1
+ =begin rdoc
2
+ Ngrams - Copyright (c) 2006 Matt Mower <self@mattmower.com>
3
+
4
+ Released under the MIT license (see LICENSE file in the distribution)
5
+
6
+ The Ngram library was written as a way of mucking about with bigram and trigram
7
+ analysis of English words. It was inspired by {Tom Van Vleck's GPW program}[http://www.multicians.org/thvv/tvvtools.html] that uses trigrams to produce pronounceable passwords and an updated version, genpasswd, is included with the library. It addresses the one shortcoming of Van Vleck's program which was that it didn't consider trigrams at the beginning of words
8
+ separately to those occurring anywhere in a word.
9
+
10
+ All the real work here is being done by the #Dictionary class which includes methods for parsing ngrams from words and files, for indexing them for fast lookup, and to return random selections based upon frequency of occurrence.
11
+
12
+ Possible improvements to this library could include a means to bias the frequency analysis for example to make less common combinations occur more frequently (effectively inverting the probability of occurrence).
13
+
14
+ Two command line utilities are supplied:
15
+
16
+ ngramtool - to build stores from a dictionary file and extract ngrams from a store.
17
+
18
+ genpasswd - creates random pronounceable passwords using trigrams
19
+
20
+ See --help for each tool
21
+
22
+ The ngrams library comes with a store pre-built (using the standard MacOSX dictionary file) which should be sufficient for most purposes. To use a different dictionary build a new store and pass the location of the store when initializing Dictionary.
23
+ =end
24
+ require 'YAML'
25
+
26
+ require 'ngrams/stdlib_ext'
27
+
28
module Ngram

  # The Dictionary holds an indexed collection of bigrams (2-letter combinations) and
  # trigrams (3-letter combinations) extracted from a dictionary of words.
  #
  # Example usage:
  #   dict = Dictionary.load
  #   word = dict.ngram( :first, 3 )
  #   5.times { word << dict.next_char( word ) }
  #   puts word
  #
  # of course a simpler way to achieve the same would be to use dict.word(8)
  #
  class Dictionary
    attr_accessor :ngrams, :ridx, :walk

    DEFAULT_STORE = File.join( File.dirname( __FILE__ ), '..', '..', 'data', 'ngrams.yml' )

    # Return a Dictionary instance initialized using the YAML data in the specified file.
    #
    # The store is a YAML dump of a Dictionary object (see #save), so it has to be
    # loaded with full object deserialization. Newer Psych versions make YAML.load
    # a "safe" load that refuses to build arbitrary objects, hence unsafe_load is
    # used when it is available.
    #
    # SECURITY: deserializing arbitrary objects is unsafe on untrusted input; only
    # load store files that you built yourself or otherwise trust.
    def self.load( file = DEFAULT_STORE )
      File.open( file ) do |io|
        YAML.respond_to?( :unsafe_load ) ? YAML.unsafe_load( io ) : YAML.load( io )
      end
    end

    # Initialize a new, empty, Dictionary.
    #
    # Use #add_from_file or #add_from_word to load new ngrams into the dictionary. Once
    # all words have been loaded call #build_indices to ready the dictionary for use and
    # #save to write it to disk.
    def initialize
      @ngrams = {
        :first => { 2 => Hash.new( 0 ), 3 => Hash.new( 0 ) },
        :any   => { 2 => Hash.new( 0 ), 3 => Hash.new( 0 ) }
      }

      init_reverse_index
      init_walk_tree
    end

    # Returns a randomly selected 2 or 3 character ngram string
    #
    # Specifying type :first will select only ngrams that appear at the beginning of words
    # from the source dictionary. Type :any will select ngrams that appear anywhere in a word.
    #
    # length can be either 2 (bigram) or 3 (trigram)
    #
    # The Dictionary tracks the frequency of each ngram and the random selection is weighted
    # such that the probability of any ngram being selected is proportional to its frequency
    # in the source dictionary.
    #
    # Fails with NoMethodError (on nil) when no ngrams of the requested kind have been
    # indexed, e.g. when #build_indices has not been called yet.
    def ngram( type, length )
      # r is uniform over 0...total. An entry with cumulative sum S and count k must own
      # exactly the k values S-k...S, so the comparison has to be strict.
      r = Integer( @sigma[type][length] * rand )
      @ridx[type][length].detect { |sum,_| sum > r }.last.dup
    end

    # Returns a randomly selected character to follow the input. Repeated calls to this method
    # implement a random-walk through the ngrams in the dictionary given a specified starting point.
    #
    # Either supply a string parameter containing a word for completion or two
    # single characters. The following calls are equivalent:
    #
    #   next_char( 'a', 'b' )
    #   next_char( 'ab' )
    #
    # In both cases the call will return a randomly selected character to follow the specified
    # characters. The Dictionary tracks the frequency of each ngram and the random selection
    # is weighted such that the probability of any following character being selected is proportional
    # to the frequency with which it follows the specified characters in the source dictionary.
    #
    # Fails with NoMethodError (on nil) when the dictionary holds no trigram starting
    # with the given pair of characters.
    def next_char( a, b = nil )
      if b.nil?
        a, b = a[-2,1], a[-1,1]
      end
      total, followers = @walk[a][b]
      # Strict comparison for the same reason as in #ngram.
      r = Integer( total * rand )
      followers.detect { |sum,_| sum > r }.last.dup
    end

    # Returns a word created by selecting a starting ngram and then doing a random walk
    # to add the remaining characters to the specified length.
    #
    # The word always starts from a trigram, so the result is never shorter than
    # three characters even when a smaller length is requested.
    def word( length )
      s = ngram( :first, 3 )
      ( length - 3 ).times { s << next_char( s ) }
      s
    end

    # Store the Ngram dictionary and indices to a file using YAML
    def save( file )
      File.open( file, "w" ) { |io| YAML.dump( self, io ) }
    end

    # Add ngrams to the current dictionary corresponding to the words found in
    # the specified file. The file should contain one word per line and
    # (ideally) only use alpha characters. Each line is chomped and downcased
    # before it is added.
    def add_from_file( file )
      File.foreach( file ) { |line| add_from_word( line.chomp.downcase ) }
    end

    # Add ngrams to the current dictionary using the given word as a source.
    #
    # Relies on String#ngrams (from ngrams/stdlib_ext). Words shorter than the ngram
    # length yield no ngrams and are skipped for that length.
    def add_from_word( word )
      2.upto( 3 ) do |n|
        grams = word.ngrams( n )
        next if grams.empty?

        inc( :first, n, grams.first )
        grams.each { |gram| inc( :any, n, gram ) }
      end
    end

    # Used to build the reverse index and trees that are used by the
    # random selection and walk code. If using a new dictionary (rather than
    # a dictionary obtained via #load) call this before using #word, #ngram, or
    # #next_char
    def build_indices
      build_reverse_index
      build_walk_tree
    end

    private

    # Rebuild @ridx (lists of [cumulative_count, ngram] pairs) and @sigma (total count)
    # for every type/length combination from the raw counts in @ngrams.
    def build_reverse_index
      init_reverse_index
      [ :first, :any ].each do |type|
        [ 2, 3 ].each do |n|
          running = 0
          @ngrams[type][n].each do |gram,score|
            running += score
            @ridx[type][n] << [running,gram]
          end
          # The final running sum is the total count for this type/length.
          @sigma[type][n] = running
        end
      end
    end

    # Rebuild @walk from the :any trigrams: @walk[a][b] is [total, [[cumulative, c], ...]]
    # describing which characters c follow the pair a, b and how often.
    def build_walk_tree
      init_walk_tree
      @ngrams[:any][3].each do |gram,score|
        # Split by character (not by byte) so multi-byte characters stay intact.
        a, b, c = gram.chars
        # Pairs outside the pre-seeded a-z table (digits, apostrophes, ...) get their
        # bucket created on demand instead of crashing on a nil lookup.
        bucket = ( @walk[a] ||= {} )[b] ||= [0,[]]
        bucket[0] += score
        bucket[1] << [bucket[0],c]
      end
    end

    # Reset the reverse index and totals to their empty state.
    def init_reverse_index
      @ridx = {
        :first => { 2 => [], 3 => [] },
        :any   => { 2 => [], 3 => [] }
      }
      @sigma = {
        :first => { 2 => 0, 3 => 0 },
        :any   => { 2 => 0, 3 => 0 }
      }
    end

    # Reset the walk tree to an empty bucket for every pair of lowercase letters.
    def init_walk_tree
      @walk = {}
      ('a'..'z').each do |a|
        @walk[a] = {}
        ('a'..'z').each { |b| @walk[a][b] = [0,[]] }
      end
    end

    # Increment the occurrence count of ngram for the given type and length.
    # fetch is used rather than the hash default because a dictionary restored
    # from YAML no longer carries the Hash.new( 0 ) default.
    def inc( type, n, ngram )
      counts = @ngrams[type][n]
      counts[ngram] = counts.fetch( ngram, 0 ) + 1
    end
  end

end
214
+
215
# When run directly as a script: build a dictionary from the word list given as the
# first argument and save it. The previous code called #parse_from_file and #store,
# neither of which Dictionary defines; the equivalent methods are #add_from_file and
# #save. Indices are built before saving so the stored dictionary is ready for use.
#
# NOTE(review): the output path is an assumption - an optional second argument is
# used when given, otherwise the default store location is overwritten. Confirm
# this matches the intended behaviour of the old argument-less #store call.
if __FILE__ == $0
  unless ARGV.empty?
    ngs = Ngram::Dictionary.new
    ngs.add_from_file( ARGV[0] )
    ngs.build_indices
    ngs.save( ARGV[1] || Ngram::Dictionary::DEFAULT_STORE )
  end
end
@@ -0,0 +1,22 @@
1
+ require 'ngrams'
2
+ include Ngram
3
+
4
module PwdGen

  # Generates words by delegating to an ngram Dictionary loaded from a store file.
  class PasswordGenerator

    # Load the Dictionary from the given store file (the default store when omitted).
    def initialize( file = Dictionary::DEFAULT_STORE )
      @ngs = Dictionary.load( file )
    end

    # Return a single generated word of the requested length.
    def generate( length )
      @ngs.word( length )
    end

    # Return an array of n independently generated words of the requested length.
    def generate_n( n, length )
      Array.new( n ) { generate( length ) }
    end

  end

end
@@ -0,0 +1,13 @@
1
+ #
2
+ # Extend standard library classes with methods required by Ngrams
3
+ #
4
# Patch String#ngrams in only when String has no instance method of that name yet.
# The check has to look at String's instance methods (method_defined?);
# String.respond_to? would only test for a class-level String.ngrams and so
# would never notice an existing instance method.
if String.method_defined?( :ngrams )
  raise "Cannot patch in String#ngrams as it is already defined!"
else
  class String
    # Return the result of splitting the string into an array of ngrams of length n,
    # i.e. every substring of length n in order of starting position. Returns an
    # empty array when the string is shorter than n.
    def ngrams( n )
      (0..length - n).map { |idx| self[idx, n] }
    end
  end
end
metadata ADDED
@@ -0,0 +1,54 @@
1
+ --- !ruby/object:Gem::Specification
2
+ rubygems_version: 0.9.0
3
+ specification_version: !int:Fixnum 1
4
+ name: ngrams
5
+ version: !ruby/object:Gem::Version
6
+ version: 0.1.0
7
+ date: 2006-07-23 00:00:00 +01:00
8
+ summary: A library for manipulating bigrams and trigrams to generate pronouncable words.
9
+ require_paths:
10
+ - lib
11
+ email: self@mattmower.com
12
+ homepage: http://rubyforge.org/projects/ngrams/
13
+ rubyforge_project: ngrams
14
+ description:
15
+ autorequire: ngrams
16
+ default_executable:
17
+ bindir: bin
18
+ has_rdoc: true
19
+ required_ruby_version: !ruby/object:Gem::Version::Requirement
20
+ requirements:
21
+ - - ">"
22
+ - !ruby/object:Gem::Version
23
+ version: 0.0.0
24
+ version:
25
+ platform: ruby
26
+ signing_key:
27
+ cert_chain:
28
+ post_install_message:
29
+ authors:
30
+ - Matt Mower
31
+ files:
32
+ - lib/ngrams.rb
33
+ - lib/ngrams/ngrams.rb
34
+ - lib/ngrams/pwdgen.rb
35
+ - lib/ngrams/stdlib_ext.rb
36
+ - data/ngrams.yml
37
+ - bin/genpasswd
38
+ - bin/ngramtool
39
+ - LICENSE
40
+ test_files: []
41
+
42
+ rdoc_options: []
43
+
44
+ extra_rdoc_files: []
45
+
46
+ executables:
47
+ - genpasswd
48
+ - ngramtool
49
+ extensions: []
50
+
51
+ requirements: []
52
+
53
+ dependencies: []
54
+