ngrams 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE +9 -0
- data/bin/genpasswd +41 -0
- data/bin/ngramtool +87 -0
- data/data/ngrams.yml +56576 -0
- data/lib/ngrams.rb +2 -0
- data/lib/ngrams/ngrams.rb +221 -0
- data/lib/ngrams/pwdgen.rb +22 -0
- data/lib/ngrams/stdlib_ext.rb +13 -0
- metadata +54 -0
data/lib/ngrams.rb
ADDED
@@ -0,0 +1,221 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Ngrams - Copyright (c) 2006 Matt Mower <self@mattmower.com>
|
3
|
+
|
4
|
+
Released under the MIT license (see LICENSE file in the distribution)
|
5
|
+
|
6
|
+
The Ngram library was written as a way of mucking about with bigram and trigram
|
7
|
+
analysis of English words. It was inspired by {Tom Van Vleck's GPW program}[http://www.multicians.org/thvv/tvvtools.html], which uses trigrams to produce pronounceable passwords; an updated version, genpasswd, is included with the library. It addresses the one shortcoming of Van Vleck's program, which was that it didn't consider trigrams at the beginning of words
|
8
|
+
separately to those occurring anywhere in a word.
|
9
|
+
|
10
|
+
All the real work here is being done by the Dictionary class, which includes methods for parsing ngrams from words and files, for indexing them for fast lookup, and for returning random selections based upon frequency of occurrence.
|
11
|
+
|
12
|
+
Possible improvements to this library could include a means to bias the frequency analysis for example to make less common combinations occur more frequently (effectively inverting the probability of occurrence).
|
13
|
+
|
14
|
+
Two command line utilities are supplied:
|
15
|
+
|
16
|
+
ngramtool - to build stores from a dictionary file and extract ngrams from a store.
|
17
|
+
|
18
|
+
genpasswd - to create random pronounceable passwords using trigrams
|
19
|
+
|
20
|
+
See --help for each tool
|
21
|
+
|
22
|
+
The ngrams library comes with a store pre-built (using the standard Mac OS X dictionary file) which should be sufficient for most purposes. To use a different dictionary, build a new store and pass the location of that store to Dictionary.load.
|
23
|
+
=end
|
24
|
+
# The standard library feature is named 'yaml' (lower case); requiring 'YAML'
# only works on case-insensitive filesystems and raises LoadError elsewhere.
require 'yaml'
|
25
|
+
|
26
|
+
require 'ngrams/stdlib_ext'
|
27
|
+
|
28
|
+
module Ngram

  # The Dictionary holds an indexed collection of bigrams (2-letter combinations) and
  # trigrams (3-letter combinations) extracted from a dictionary of words.
  #
  # Example usage:
  #   dict = Dictionary.load
  #   word = dict.ngram(:first, 3)
  #   5.times { word << dict.next_char(word) }
  #   puts word
  #
  # Of course a simpler way to achieve the same would be to use dict.word(8),
  # which additionally copes with pairs of characters that have no known successor.
  #
  class Dictionary
    attr_accessor :ngrams, :ridx, :walk

    DEFAULT_STORE = File.join(File.dirname(__FILE__), '..', '..', 'data', 'ngrams.yml')

    # Every (type, length) combination for which counts and indices are kept.
    INDEX_KEYS = [[:first, 2], [:first, 3], [:any, 2], [:any, 3]].freeze

    # Return a Dictionary instance initialized using the YAML data in the specified file.
    #
    # file - path of a store previously written by #save (defaults to DEFAULT_STORE).
    #
    # NOTE(security): a store is a serialized Dictionary object, so it has to be read
    # with the unrestricted YAML loader. Only load stores that come from a trusted source.
    def self.load(file = DEFAULT_STORE)
      File.open(file) do |io|
        # Newer Psych versions restrict YAML.load to plain data; unsafe_load is the
        # loader that can rebuild the Dictionary object. Older versions only have load.
        YAML.respond_to?(:unsafe_load) ? YAML.unsafe_load(io) : YAML.load(io)
      end
    end

    # Initialize a new, empty, Dictionary.
    #
    # Use #add_from_file or #add_from_word to load new ngrams into the dictionary. Once
    # all words have been loaded call #build_indices to ready the dictionary for use and
    # #save to write it to disk.
    def initialize
      @ngrams = {
        :first => { 2 => Hash.new(0), 3 => Hash.new(0) },
        :any   => { 2 => Hash.new(0), 3 => Hash.new(0) }
      }

      init_reverse_index
      init_walk_tree
    end

    # Returns a randomly selected 2 or 3 character ngram string.
    #
    # Specifying type :first will select only ngrams that appear at the beginning of words
    # from the source dictionary. Type :any will select ngrams that appear anywhere in a word.
    #
    # length can be either 2 (bigram) or 3 (trigram).
    #
    # The Dictionary tracks the frequency of each ngram and the random selection is weighted
    # such that the probability of any ngram being selected is proportional to its frequency
    # in the source dictionary.
    #
    # Raises ArgumentError for an unknown type/length combination and RuntimeError when
    # nothing has been indexed yet for that combination (see #build_indices).
    def ngram(type, length)
      unless @ridx.key?(type) && @ridx[type].key?(length)
        raise ArgumentError, "unknown ngram type/length: #{type.inspect}/#{length.inspect}"
      end

      choice = weighted_pick(@sigma[type][length], @ridx[type][length])
      if choice.nil?
        raise "No ngrams indexed for #{type.inspect}/#{length}; add words and call build_indices first"
      end
      choice
    end

    # Returns a randomly selected character to follow the input. Repeated calls to this method
    # implement a random-walk through the ngrams in the dictionary given a specified starting point.
    #
    # Either supply a string parameter containing a word for completion or two
    # single characters. The following calls are equivalent:
    #
    #   next_char('a', 'b')
    #   next_char('ab')
    #
    # In both cases the call will return a randomly selected character to follow the specified
    # characters. The Dictionary tracks the frequency of each ngram and the random selection
    # is weighted such that the probability of any following character being selected is proportional
    # to the frequency with which it follows the specified characters in the source dictionary.
    #
    # Returns nil when no trigram starting with the two characters has been indexed
    # (including when fewer than two characters were supplied).
    def next_char(a, b = nil)
      a, b = a[-2, 1], a[-1, 1] if b.nil?

      # Each leaf of the walk tree is [total_score, [[cumulative_score, char], ...]].
      total, successors = (@walk[a] || {})[b]
      weighted_pick(total, successors)
    end

    # Returns a word created by selecting a starting ngram and then doing a random walk
    # to add the remaining characters to the specified length.
    #
    # When the walk reaches a pair of characters with no known successor, the first
    # character of a frequency-weighted random bigram is appended instead so that a
    # word of the requested length is always produced. Lengths below 3 yield a
    # truncated starting trigram.
    def word(length)
      s = ngram(:first, 3)
      return s[0, [length, 0].max] if length < 3

      (length - 3).times { s << (next_char(s) || ngram(:any, 2)[0, 1]) }
      s
    end

    # Store the Ngram dictionary and indices to a file using YAML.
    def save(file)
      File.open(file, "w") { |io| YAML.dump(self, io) }
    end

    # Add ngrams to the current dictionary corresponding to the words found in
    # the specified file. The file should contain one word per line and
    # (ideally) only use alpha characters. Words are lower-cased before use.
    def add_from_file(file)
      File.open(file, "r") do |io|
        io.each_line { |line| add_from_word(line.chomp.downcase) }
      end
    end

    # Add ngrams to the current dictionary using the given word as a source.
    #
    # Only the raw counts are updated; call #build_indices afterwards to refresh
    # the structures used by #ngram, #next_char and #word.
    def add_from_word(word)
      [2, 3].each do |n|
        grams = word.ngrams(n)
        next if grams.empty?

        inc(:first, n, grams.first)
        grams.each { |gram| inc(:any, n, gram) }
      end
    end

    # Used to build the reverse index and trees that are used by the
    # random selection and walk code. If using a new dictionary (rather than
    # a dictionary obtained via #load) call this before using #word, #ngram, or
    # #next_char.
    def build_indices
      build_reverse_index
      build_walk_tree
    end

    private

    # Rebuild, for every type/length, the list of [cumulative_score, ngram] pairs
    # and the total score (sigma) used for weighted selection.
    def build_reverse_index
      init_reverse_index
      INDEX_KEYS.each do |type, n|
        running = 0
        @ngrams[type][n].each do |gram, score|
          running += score
          @ridx[type][n] << [running, gram]
        end
        # The final running total is the sum of all scores.
        @sigma[type][n] = running
      end
    end

    # Rebuild the two-level tree mapping the first two characters of every
    # trigram to the weighted list of third characters that may follow them.
    def build_walk_tree
      init_walk_tree
      @ngrams[:any][3].each do |gram, score|
        a, b, c = gram.chars.to_a
        # Create nodes on demand so characters outside a-z do not break the build.
        node = ((@walk[a] ||= {})[b] ||= [0, []])
        node[0] += score
        node[1] << [node[0], c]
      end
    end

    # Reset the reverse index and the per-index score totals to their empty state.
    def init_reverse_index
      @ridx  = { :first => { 2 => [], 3 => [] }, :any => { 2 => [], 3 => [] } }
      @sigma = { :first => { 2 => 0, 3 => 0 }, :any => { 2 => 0, 3 => 0 } }
    end

    # Reset the walk tree to an empty [total, successors] leaf for every a-z pair.
    def init_walk_tree
      letters = ('a'..'z').to_a
      @walk = {}
      letters.each do |a|
        @walk[a] = {}
        letters.each { |b| @walk[a][b] = [0, []] }
      end
    end

    # Increment the occurrence count of ngram for the given type and length.
    # Uses fetch rather than the hash default so it also works on count hashes
    # that were created without a default value of 0.
    def inc(type, n, ngram)
      counts = @ngrams[type][n]
      counts[ngram] = counts.fetch(ngram, 0) + 1
    end

    # Pick an item from a list of [cumulative_score, item] pairs with probability
    # proportional to each item's own score. Returns a copy of the item, or nil
    # when there is nothing to choose from.
    def weighted_pick(total, cumulative)
      return nil if total.nil? || cumulative.nil? || total <= 0 || cumulative.empty?

      r = rand(total) # integer in 0...total
      # Strictly greater: an item with score k must own exactly k of the possible values of r.
      entry = cumulative.find { |sum, _| sum > r }
      entry && entry.last.dup
    end
  end

end
|
214
|
+
|
215
|
+
# When this file is executed directly, build a store from the word list named
# by the first command line argument. The store is written to the path given
# as the second argument, or to Dictionary::DEFAULT_STORE when none is given.
if __FILE__ == $0
  if ARGV.length > 0
    ngs = Ngram::Dictionary.new
    ngs.add_from_file(ARGV[0])
    # The indices must be built before saving so the store is usable once loaded.
    ngs.build_indices
    ngs.save(ARGV[1] || Ngram::Dictionary::DEFAULT_STORE)
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'ngrams'
|
2
|
+
include Ngram
|
3
|
+
|
4
|
+
module PwdGen

  # Generates pronounceable passwords by asking an Ngram::Dictionary for
  # random words of a requested length.
  class PasswordGenerator

    # Load the ngram store used for generation.
    #
    # file - path of the YAML store to load (defaults to the store shipped
    #        with the library).
    #
    # The Dictionary constant is fully qualified so this class does not depend
    # on Ngram having been included into the top-level namespace.
    def initialize(file = Ngram::Dictionary::DEFAULT_STORE)
      @ngs = Ngram::Dictionary.load(file)
    end

    # Returns an Array of n passwords, each generated by #generate with the
    # given length.
    def generate_n(n, length)
      Array.new(n) { generate(length) }
    end

    # Returns a single password: a word of the given length produced by the
    # dictionary's #word method.
    def generate(length)
      @ngs.word(length)
    end

  end

end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#
|
2
|
+
# Extend standard library classes with methods required by Ngrams
|
3
|
+
#
|
4
|
+
# Refuse to overwrite an existing String#ngrams. The check must look at the
# instance methods of String (public, protected or private); asking the String
# class object whether it responds to :ngrams only tests for a class-level
# method and therefore never detects an existing instance method.
if String.method_defined?(:ngrams) || String.private_method_defined?(:ngrams)
  raise "Cannot patch in String#ngrams as it is already defined!"
else
  class String
    # Return the result of splitting the string into an array of ngrams of length n.
    #
    # Every run of n consecutive characters is returned, in order of position,
    # e.g. "abcd".ngrams(2) => ["ab", "bc", "cd"]. A string shorter than n
    # yields an empty array.
    def ngrams(n)
      (0..length - n).map { |idx| self[idx, n] }
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: !int:Fixnum 1
|
4
|
+
name: ngrams
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2006-07-23 00:00:00 +01:00
|
8
|
+
summary: A library for manipulating bigrams and trigrams to generate pronounceable words.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: self@mattmower.com
|
12
|
+
homepage: http://rubyforge.org/projects/ngrams/
|
13
|
+
rubyforge_project: ngrams
|
14
|
+
description:
|
15
|
+
autorequire: ngrams
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Matt Mower
|
31
|
+
files:
|
32
|
+
- lib/ngrams.rb
|
33
|
+
- lib/ngrams/ngrams.rb
|
34
|
+
- lib/ngrams/pwdgen.rb
|
35
|
+
- lib/ngrams/stdlib_ext.rb
|
36
|
+
- data/ngrams.yml
|
37
|
+
- bin/genpasswd
|
38
|
+
- bin/ngramtool
|
39
|
+
- LICENSE
|
40
|
+
test_files: []
|
41
|
+
|
42
|
+
rdoc_options: []
|
43
|
+
|
44
|
+
extra_rdoc_files: []
|
45
|
+
|
46
|
+
executables:
|
47
|
+
- genpasswd
|
48
|
+
- ngramtool
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
requirements: []
|
52
|
+
|
53
|
+
dependencies: []
|
54
|
+
|