ngrams 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/LICENSE +9 -0
- data/bin/genpasswd +41 -0
- data/bin/ngramtool +87 -0
- data/data/ngrams.yml +56576 -0
- data/lib/ngrams.rb +2 -0
- data/lib/ngrams/ngrams.rb +221 -0
- data/lib/ngrams/pwdgen.rb +22 -0
- data/lib/ngrams/stdlib_ext.rb +13 -0
- metadata +54 -0
data/lib/ngrams.rb
ADDED
@@ -0,0 +1,221 @@
|
|
1
|
+
=begin rdoc
|
2
|
+
Ngrams - Copyright (c) 2006 Matt Mower <self@mattmower.com>
|
3
|
+
|
4
|
+
Released under the MIT license (see LICENSE file in the distribution)
|
5
|
+
|
6
|
+
The Ngram library was written as a way of mucking about with bigram and trigram
|
7
|
+
analysis of English words. It was inspired by {Tom Van Vleck's GPW program}[http://www.multicians.org/thvv/tvvtools.html], which uses trigrams to produce pronounceable passwords; an updated version, genpasswd, is included with the library. It addresses the one shortcoming of Van Vleck's program, which was that it didn't consider trigrams at the beginning of words
|
8
|
+
separately to those occurring anywhere in a word.
|
9
|
+
|
10
|
+
All the real work here is done by the #Dictionary class, which includes methods for parsing ngrams from words and files, for indexing them for fast lookup, and for returning random selections based upon frequency of occurrence.
|
11
|
+
|
12
|
+
Possible improvements to this library could include a means to bias the frequency analysis for example to make less common combinations occur more frequently (effectively inverting the probability of occurrence).
|
13
|
+
|
14
|
+
Two command line utilities are supplied:
|
15
|
+
|
16
|
+
ngramtool - to build stores from a dictionary file and extract ngrams from a store.
|
17
|
+
|
18
|
+
genpasswd - creates random pronounceable passwords using trigrams
|
19
|
+
|
20
|
+
See --help for each tool
|
21
|
+
|
22
|
+
The ngrams library comes with a pre-built store (built from the standard MacOSX dictionary file) which should be sufficient for most purposes. To use a different dictionary, build a new store and pass the location of that store to Dictionary.load.
|
23
|
+
=end
|
24
|
+
# The YAML standard library is required by its lowercase feature name;
# 'YAML' only resolves on case-insensitive file systems.
require 'yaml'
|
25
|
+
|
26
|
+
require 'ngrams/stdlib_ext'
|
27
|
+
|
28
|
+
module Ngram

  # The Dictionary holds an indexed collection of bigrams (2-letter combinations) and
  # trigrams (3-letter combinations) extracted from a dictionary of words.
  #
  # Example usage:
  #   dict = Dictionary.load
  #   word = dict.ngram( :first, 3 )
  #   5.times { word << dict.next_char( word ) }
  #   puts word
  #
  # of course a simpler way to achieve the same would be to use dict.word(8)
  #
  class Dictionary
    attr_accessor :ngrams, :ridx, :walk

    DEFAULT_STORE = File.join( File.dirname( __FILE__ ), '..', '..', 'data', 'ngrams.yml' )

    # Return a Dictionary instance initialized using the YAML data in the specified file.
    #
    # The store holds a serialized Dictionary object, so it has to be read with the
    # unrestricted YAML loader where one exists (newer YAML versions restrict plain
    # YAML.load to basic types).
    # NOTE(review): unrestricted loading can instantiate arbitrary objects - only
    # load stores that come from a trusted source.
    def self.load( file = DEFAULT_STORE )
      File.open( file ) do |io|
        YAML.respond_to?( :unsafe_load ) ? YAML.unsafe_load( io ) : YAML.load( io )
      end
    end

    # Initialize a new, empty, Dictionary.
    #
    # Use #add_from_file or #add_from_word to load new ngrams into the dictionary. Once
    # all words have been loaded call #build_indices to ready the dictionary for use and
    # #save to write it to disk.
    def initialize
      @ngrams = {
        :first => { 2 => Hash.new( 0 ), 3 => Hash.new( 0 ) },
        :any   => { 2 => Hash.new( 0 ), 3 => Hash.new( 0 ) }
      }

      init_reverse_index
      init_walk_tree
    end

    # Returns a randomly selected 2 or 3 character ngram string
    #
    # Specifying type :first will select only ngrams that appear at the beginning of words
    # from the source dictionary. Type :any will select ngrams that appear anywhere in a word.
    #
    # length can be either 2 (bigram) or 3 (trigram)
    #
    # The Dictionary tracks the frequency of each ngram and the random selection is weighted
    # such that the probability of any ngram being selected is proportional to its frequency
    # in the source dictionary.
    #
    # Raises a RuntimeError when no ngrams of the requested kind have been indexed.
    def ngram( type, length )
      r = Integer( @sigma[type][length] * rand )
      # r lies in 0...total, so an entry owns the values strictly below its running
      # total; a strict comparison gives every entry exactly `score` chances.
      pick = @ridx[type][length].detect { |sum, _| sum > r }
      if pick.nil?
        raise "No #{length}-character ngrams of type #{type.inspect} are indexed (call build_indices after adding words)"
      end
      pick.last.dup
    end

    # Returns a randomly selected character to follow the input. Repeated calls to this method
    # implement a random-walk through the ngrams in the dictionary given a specified starting point.
    #
    # Either supply a string parameter containing a word for completion or two
    # single characters. The following calls are equivalent:
    #
    #   next_char( 'a', 'b' )
    #   next_char( 'ab' )
    #
    # In both cases the call will return a randomly selected character to follow the specified
    # characters. The Dictionary tracks the frequency of each ngram and the random selection
    # is weighted such that the probability of any following character being selected is proportional
    # to the frequency with which it follows the specified characters in the source dictionary.
    #
    # Raises a RuntimeError when no indexed trigram begins with the given pair.
    def next_char( a, b = nil )
      a, b = a[-2, 1], a[-1, 1] if b.nil?

      total, choices = ( @walk[a] || {} )[b]
      if choices.nil? || choices.empty?
        raise "No trigram in the dictionary begins with #{a.inspect} followed by #{b.inspect}"
      end

      r = Integer( total * rand )
      # Strict comparison for the same reason as in #ngram.
      choices.detect { |sum, _| sum > r }.last.dup
    end

    # Returns a word created by selecting a starting ngram and then doing a random walk
    # to add the remaining characters to the specified length.
    #
    # The starting trigram is always used whole, so the result is never shorter
    # than 3 characters.
    def word( length )
      s = ngram( :first, 3 )
      ( length - 3 ).times { s << next_char( s ) }
      s
    end

    # Store the Ngram dictionary and indices to a file using YAML
    def save( file )
      File.open( file, "w" ) { |io| YAML.dump( self, io ) }
    end

    # Add ngrams to the current dictionary corresponding to the words found in
    # the specified file. The file should contain one word per line and
    # (ideally) only use alpha characters. Each line is chomped and downcased
    # before being added.
    def add_from_file( file )
      File.open( file, "r" ) do |io|
        io.each { |line| add_from_word( line.chomp.downcase ) }
      end
    end

    # Add ngrams to the current dictionary using the given word as a source.
    #
    # Both the bigrams and the trigrams of the word are counted; the leading
    # ngram is additionally counted under :first. Words shorter than the ngram
    # length contribute nothing for that length.
    def add_from_word( word )
      2.upto( 3 ) do |n|
        grams = word.ngrams( n )
        next if grams.empty?

        inc( :first, n, grams.first )
        grams.each { |gram| inc( :any, n, gram ) }
      end
    end

    # Used to build the reverse index and trees that are used by the
    # random selection and walk code. If using a new dictionary (rather than
    # a dictionary obtained via #load) call this before using #word, #ngram, or
    # #next_char
    def build_indices
      build_reverse_index
      build_walk_tree
    end

    private

    # Rebuild, for every ngram kind, the list of [running total, ngram] pairs
    # searched by #ngram, plus the grand total of all scores of that kind.
    def build_reverse_index
      init_reverse_index
      [ [:first, 2], [:first, 3], [:any, 2], [:any, 3] ].each do |type, n|
        running = 0
        @ngrams[type][n].each do |gram, score|
          running += score
          @ridx[type][n] << [running, gram]
        end
        # After the loop the running total is the sum of every score.
        @sigma[type][n] = running
      end
    end

    # Rebuild the walk tree: for each leading character pair, the total score of
    # the trigrams starting with it and a list of [running total, third character].
    def build_walk_tree
      init_walk_tree
      @ngrams[:any][3].each do |gram, score|
        # Split by character so the keys agree with the slicing in #next_char.
        a, b, c = gram.chars
        # Slots outside a-z (apostrophes, digits, ...) are created on demand.
        row = ( @walk[a] ||= {} )
        slot = ( row[b] ||= [0, []] )
        slot[0] += score
        slot[1] << [slot[0], c]
      end
    end

    # Reset the reverse index and the per-kind score totals to their empty state.
    def init_reverse_index
      @ridx = {
        :first => { 2 => [], 3 => [] },
        :any   => { 2 => [], 3 => [] }
      }
      @sigma = {
        :first => { 2 => 0, 3 => 0 },
        :any   => { 2 => 0, 3 => 0 }
      }
    end

    # Reset the walk tree to an empty [total, followers] slot for every pair of
    # lowercase letters a-z.
    def init_walk_tree
      @walk = {}
      ( 'a'..'z' ).each do |a|
        @walk[a] = {}
        ( 'a'..'z' ).each do |b|
          @walk[a][b] = [0, []]
        end
      end
    end

    # Increment the count of the given ngram for the given type and length.
    # fetch is used rather than the hash default so that counting does not
    # depend on the hash having a default value.
    def inc( type, n, ngram )
      counts = @ngrams[type][n]
      counts[ngram] = counts.fetch( ngram, 0 ) + 1
    end
  end

end
|
214
|
+
|
215
|
+
# When this file is run directly with a word-list file as its first argument,
# build a dictionary from that file and write it out as a store.
#
# The store is written to the path given as the second argument.
# NOTE(review): with no second argument it is written to
# Dictionary::DEFAULT_STORE, replacing the bundled store - confirm this is the
# intended default.
if __FILE__ == $0
  if ARGV.length > 0
    ngs = Ngram::Dictionary.new
    ngs.add_from_file( ARGV[0] )
    # The indices must be built before saving so the stored dictionary is usable.
    ngs.build_indices
    ngs.save( ARGV[1] || Ngram::Dictionary::DEFAULT_STORE )
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
require 'ngrams'
|
2
|
+
include Ngram
|
3
|
+
|
4
|
+
module PwdGen

  # Generates passwords by asking an Ngram Dictionary for random words.
  class PasswordGenerator

    # Load the Dictionary stored in +file+; the Dictionary's default store is
    # used when no file is given.
    def initialize( file = Dictionary::DEFAULT_STORE )
      @ngs = Dictionary.load( file )
    end

    # Return an Array holding the results of +n+ calls to #generate( length ).
    def generate_n( n, length )
      Array.new( n ) { generate( length ) }
    end

    # Return a single password: the Dictionary's #word for the requested length.
    def generate( length )
      @ngs.word( length )
    end

  end

end
|
@@ -0,0 +1,13 @@
|
|
1
|
+
#
|
2
|
+
# Extend standard library classes with methods required by Ngrams
|
3
|
+
#
|
4
|
+
# Only patch String#ngrams in when String has no such instance method yet.
# method_defined? inspects instance methods; String.respond_to? would ask about
# methods of the String class object itself and so could never detect an
# existing String#ngrams.
if String.method_defined?( :ngrams )
  raise "Cannot patch in String#ngrams as it is already defined!"
else
  class String
    # Return the result of splitting the string into an array of ngrams of length n.
    #
    # Every run of n consecutive characters is returned, in order of position.
    # A string shorter than n yields an empty array.
    def ngrams( n )
      ( 0..length - n ).map { |idx| self[idx, n] }
    end
  end
end
|
metadata
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
rubygems_version: 0.9.0
|
3
|
+
specification_version: !int:Fixnum 1
|
4
|
+
name: ngrams
|
5
|
+
version: !ruby/object:Gem::Version
|
6
|
+
version: 0.1.0
|
7
|
+
date: 2006-07-23 00:00:00 +01:00
|
8
|
+
summary: A library for manipulating bigrams and trigrams to generate pronouncable words.
|
9
|
+
require_paths:
|
10
|
+
- lib
|
11
|
+
email: self@mattmower.com
|
12
|
+
homepage: http://rubyforge.org/projects/ngrams/
|
13
|
+
rubyforge_project: ngrams
|
14
|
+
description:
|
15
|
+
autorequire: ngrams
|
16
|
+
default_executable:
|
17
|
+
bindir: bin
|
18
|
+
has_rdoc: true
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
20
|
+
requirements:
|
21
|
+
- - ">"
|
22
|
+
- !ruby/object:Gem::Version
|
23
|
+
version: 0.0.0
|
24
|
+
version:
|
25
|
+
platform: ruby
|
26
|
+
signing_key:
|
27
|
+
cert_chain:
|
28
|
+
post_install_message:
|
29
|
+
authors:
|
30
|
+
- Matt Mower
|
31
|
+
files:
|
32
|
+
- lib/ngrams.rb
|
33
|
+
- lib/ngrams/ngrams.rb
|
34
|
+
- lib/ngrams/pwdgen.rb
|
35
|
+
- lib/ngrams/stdlib_ext.rb
|
36
|
+
- data/ngrams.yml
|
37
|
+
- bin/genpasswd
|
38
|
+
- bin/ngramtool
|
39
|
+
- LICENSE
|
40
|
+
test_files: []
|
41
|
+
|
42
|
+
rdoc_options: []
|
43
|
+
|
44
|
+
extra_rdoc_files: []
|
45
|
+
|
46
|
+
executables:
|
47
|
+
- genpasswd
|
48
|
+
- ngramtool
|
49
|
+
extensions: []
|
50
|
+
|
51
|
+
requirements: []
|
52
|
+
|
53
|
+
dependencies: []
|
54
|
+
|