correct-horse-battery-staple 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (61) hide show
  1. data.tar.gz.sig +1 -1
  2. data/.gemtest +0 -0
  3. data/Gemfile +53 -0
  4. data/Gemfile.lock +109 -0
  5. data/History.txt +6 -0
  6. data/Manifest.txt +57 -0
  7. data/README.txt +115 -0
  8. data/Rakefile +47 -0
  9. data/bin/chbs +234 -0
  10. data/bin/chbs-mkpass +16 -0
  11. data/correct-horse-battery-staple.gemspec +59 -0
  12. data/lib/correct_horse_battery_staple.rb +117 -0
  13. data/lib/correct_horse_battery_staple/assembler.rb +45 -0
  14. data/lib/correct_horse_battery_staple/backend.rb +6 -0
  15. data/lib/correct_horse_battery_staple/backend/isam_kd.rb +410 -0
  16. data/lib/correct_horse_battery_staple/backend/redis.rb +95 -0
  17. data/lib/correct_horse_battery_staple/backend/redis/d_range.rb +105 -0
  18. data/lib/correct_horse_battery_staple/corpus.rb +33 -0
  19. data/lib/correct_horse_battery_staple/corpus/base.rb +278 -0
  20. data/lib/correct_horse_battery_staple/corpus/isam.rb +258 -0
  21. data/lib/correct_horse_battery_staple/corpus/isam_kd.rb +60 -0
  22. data/lib/correct_horse_battery_staple/corpus/redis.rb +188 -0
  23. data/lib/correct_horse_battery_staple/corpus/redis2.rb +88 -0
  24. data/lib/correct_horse_battery_staple/corpus/serialized.rb +121 -0
  25. data/lib/correct_horse_battery_staple/corpus/sqlite.rb +266 -0
  26. data/lib/correct_horse_battery_staple/generator.rb +40 -0
  27. data/lib/correct_horse_battery_staple/memoize.rb +25 -0
  28. data/lib/correct_horse_battery_staple/parser.rb +5 -0
  29. data/lib/correct_horse_battery_staple/parser/base.rb +5 -0
  30. data/lib/correct_horse_battery_staple/parser/regex.rb +58 -0
  31. data/lib/correct_horse_battery_staple/range_parser.rb +29 -0
  32. data/lib/correct_horse_battery_staple/statistical_array.rb +74 -0
  33. data/lib/correct_horse_battery_staple/stats.rb +22 -0
  34. data/lib/correct_horse_battery_staple/word.rb +90 -0
  35. data/lib/correct_horse_battery_staple/writer.rb +29 -0
  36. data/lib/correct_horse_battery_staple/writer/base.rb +22 -0
  37. data/lib/correct_horse_battery_staple/writer/csv.rb +15 -0
  38. data/lib/correct_horse_battery_staple/writer/file.rb +54 -0
  39. data/lib/correct_horse_battery_staple/writer/isam.rb +50 -0
  40. data/lib/correct_horse_battery_staple/writer/isam_kd.rb +12 -0
  41. data/lib/correct_horse_battery_staple/writer/json.rb +19 -0
  42. data/lib/correct_horse_battery_staple/writer/marshal.rb +10 -0
  43. data/lib/correct_horse_battery_staple/writer/redis.rb +41 -0
  44. data/lib/correct_horse_battery_staple/writer/sqlite.rb +115 -0
  45. data/script/generate_all +34 -0
  46. data/script/load_redis +17 -0
  47. data/script/perftest +74 -0
  48. data/spec/corpus/serialized_spec.rb +62 -0
  49. data/spec/corpus_spec.rb +50 -0
  50. data/spec/correct_horse_battery_staple_spec.rb +73 -0
  51. data/spec/fixtures/100.json +101 -0
  52. data/spec/fixtures/corpus1.csv +101 -0
  53. data/spec/fixtures/corpus100.json +101 -0
  54. data/spec/fixtures/wiktionary1000.htm +648 -0
  55. data/spec/range_parser_spec.rb +54 -0
  56. data/spec/spec_helper.rb +20 -0
  57. data/spec/statistical_array_spec.rb +52 -0
  58. data/spec/support/spec_pry.rb +1 -0
  59. data/spec/word_spec.rb +95 -0
  60. metadata +264 -0
  61. metadata.gz.sig +1 -0
@@ -0,0 +1,40 @@
1
+
2
+ require 'securerandom'
3
+
4
+ #
5
+ # Generate an N-word passphrase from a corpus
6
+ #
7
class CorrectHorseBatteryStaple::Generator
  include CorrectHorseBatteryStaple::Common
  include CorrectHorseBatteryStaple::Memoize

  attr_accessor :word_length, :corpus

  # corpus      - source of passphrase words; this class calls #filter,
  #               #pick, #count_candidates and #result on it.
  # word_length - optional set of acceptable word lengths; anything that
  #               responds to #include? (e.g. 3..6). When given, the corpus
  #               is filtered to words whose length is included.
  def initialize(corpus, word_length = nil)
    @corpus = corpus
    # Keep the requested constraint so the word_length reader reports it
    # (it was previously declared but never assigned).
    @word_length = word_length
    if word_length
      @corpus.filter { |entry| word_length.include?(entry.word.to_s.length) }
    end
  end

  # Picks +count+ entries from the corpus (forwarding +options+ to #pick) and
  # returns their words, downcased and joined with "-".
  def make(count = 4, options = {})
    @corpus.pick(count, options).
      map { |entry| entry.word.downcase }.
      join("-")
  end

  # Returns floor(log2(n)), where n is the corpus' candidate count for
  # +options+. The result is memoized per distinct +options+ value.
  # NOTE(review): +log+ is not defined in this class; presumably it comes
  # from the included Common module - confirm.
  def estimate_entropy(options)
    candidate_count = @corpus.count_candidates(options)
    (log(candidate_count) / log(2)).floor
  end
  memoize :estimate_entropy

  # Returns the corpus' #result, cached after the first call.
  def words
    @words ||= @corpus.result
  end
end
36
+
37
# When this file is executed directly, print one passphrase built from the
# default corpus using words of 3 to 6 characters. The optional first
# command-line argument is the number of words (default 4).
if __FILE__ == $0
  generator = CorrectHorseBatteryStaple::Generator.new(CorrectHorseBatteryStaple.default_corpus, 3..6)
  word_count = (ARGV[0] || 4).to_i
  puts generator.make(word_count)
end
@@ -0,0 +1,25 @@
1
module CorrectHorseBatteryStaple
  # Mixin that adds a class-level +memoize+ macro. Including it extends the
  # including class with ClassMethods.
  module Memoize
    def self.included(base)
      base.extend ClassMethods
    end

    module ClassMethods
      # Wraps the already-defined instance method +method+ so that its result
      # is cached per receiver and per argument list.
      #
      # The original implementation is kept under the alias
      # "_<method>_unmemoized". Results are stored in the receiver's
      # @_memoize_cache, keyed first by method name and then by the args array.
      #
      # Raises ArgumentError when the memoized method is called with a block.
      def memoize(method)
        old_method = "_#{method}_unmemoized".to_sym
        alias_method old_method, method
        define_method method do |*args, &block|
          @_memoize_cache ||= {}
          methcache = (@_memoize_cache[method] ||= {})
          if block
            raise ArgumentError, "You cannot call a memoized method with a block! #{method}"
          end
          # Hash#fetch with a block distinguishes "key absent" from any stored
          # value (nil, false, classes, procs, ...) without comparing the
          # cached value against a sentinel via ===, which would dispatch to
          # the cached value's own === (Class#===, Proc#===, Regexp#===).
          methcache.fetch(args) do
            methcache[args] = send(old_method, *args)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,5 @@
1
# Parser namespace. The concrete parser classes are registered with autoload,
# so each file is only loaded when its constant is first referenced.
class CorrectHorseBatteryStaple::Parser
  autoload :Base,  'correct_horse_battery_staple/parser/base'
  autoload :Regex, 'correct_horse_battery_staple/parser/regex'
end
5
+
@@ -0,0 +1,5 @@
1
class CorrectHorseBatteryStaple::Parser
  # Superclass for concrete parsers: a Parser subclass that mixes in the
  # CorrectHorseBatteryStaple::Common helpers.
  class Base < CorrectHorseBatteryStaple::Parser
    include CorrectHorseBatteryStaple::Common
  end
end
@@ -0,0 +1,58 @@
1
+
2
class CorrectHorseBatteryStaple::Parser
  # Parses word-frequency lists by scanning the whole input with a
  # format-specific regular expression and building one Word per match.
  class Regex < Base
    # Maps a parser type to [pattern, builder]. The builder receives the
    # array of capture groups that String#scan yields for one match, so its
    # indexes are zero-based capture-group positions (uncaptured columns such
    # as PoS do not occupy an index).
    PARSERS = {
      :wiktionary => [%r{<a href="/wiki/\w+" title="(\w+)">\w+</a> = (\d+)},
                      lambda {|match| CorrectHorseBatteryStaple::Word.new(:word => match[0], :frequency => match[1].to_i) }],

      # rank	lemma	PoS	freq	dispersion
      # 7	to	t	6332195	0.98
      #
      # Captures: 0 = rank, 1 = lemma, 2 = freq, 3 = dispersion.
      :wordfrequency => [ %r{^(\d+)\s+(\w+)\s+\w*\s+(\d+)\s+([0-9.]+)},
                          lambda {|match| CorrectHorseBatteryStaple::Word.new(:word => match[1],
                                                                             :rank => match[0].to_f,
                                                                             :frequency => match[2].to_f,
                                                                             :dispersion => match[3].to_f)
                          }],

      # using tabs between columns
      # freq      word      PoS      # texts
      # -----     -----     -----    -----
      # 22995878  the       at       169011
      # 11239776  and       cc       168844
      #
      # Captures: 0 = freq, 1 = word, 2 = # texts.
      :coca => [ %r{^(\d+)\s+(\w+)\s+\w*\s+(\d+)},
                 lambda {|match| CorrectHorseBatteryStaple::Word.new(:word => match[1],
                                                                    :frequency => match[0].to_i,
                                                                    :texts => match[2].to_i)
                 }],

      # <tr>
      # <td>25</td>
      # <td><a href="/wiki/be" title="be">be</a></td>
      # <td>191823</td>
      # </tr>
      #
      # Captures: 0 = rank, 1 = word, 2 = frequency.
      :tvscripts => [
                     Regexp.new('<tr>.*?<td>(\d+)</td>.*?<td>.*?title="(\w+)".*?</td>.*?<td>(\d+)</td>.*?</tr>', Regexp::MULTILINE),
                     lambda {|match| CorrectHorseBatteryStaple::Word.new(
                                                                         :rank => match[0].to_i,
                                                                         :word => match[1],
                                                                         :frequency => match[2].to_i
                                                                         ) }
                    ]
    }

    # type - key into PARSERS (anything responding to #to_sym). It is only
    #        validated when #parse is called.
    def initialize(type = :wiktionary)
      @parser_type = type.to_sym
    end

    # Reads all of +file+ (must respond to #read) and returns an Array with
    # one Word per match of the configured pattern.
    #
    # Raises ArgumentError when the configured parser type is not in PARSERS.
    def parse(file)
      raise ArgumentError, "unknown regex parser type #{@parser_type}" unless PARSERS.has_key?(@parser_type)
      regex, lexer = PARSERS[@parser_type]

      file.read.scan(regex).map do |match|
        lexer.call(match)
      end
    end
  end
end
@@ -0,0 +1,29 @@
1
+ # doesn't handle X...Y
2
+
3
module CorrectHorseBatteryStaple
  # Turns strings such as "3-6", "10..20", "1.5 - 2.5" or "7" into a Range.
  # The exclusive form "X...Y" is not supported.
  class RangeParser
    # One optionally negative number. The alternatives are ordered from most
    # to least specific: the regexes below are not anchored, so a bare
    # "[0-9]+" listed first would match only the integer part of "2.5" and
    # silently drop the fraction. The "(?!\.)" lookahead keeps the "1." form
    # from swallowing the first dot of a ".." separator.
    NUM = '-?(?:[0-9]+\.[0-9]+|\.[0-9]+|[0-9]+\.(?!\.)|[0-9]+)'
    SPACE = " *"
    SEPARATOR = "(-|\\.\\.)"
    # Capture groups: 1 = first number, 2 = separator, 3 = second number.
    REGEX_PAIR = Regexp.new("(#{NUM})#{SPACE}#{SEPARATOR}#{SPACE}(#{NUM})")
    REGEX_SINGLE = Regexp.new("#{SPACE}(#{NUM})#{SPACE}")

    # Returns an inclusive Range for the first "A-B" / "A..B" pair found in
    # +string+; failing that, a single-value Range (n..n) for the first number
    # found; nil when +string+ contains no number at all.
    def parse(string)
      match = string.match(REGEX_PAIR)
      if match
        return Range.new(parse_number(match[1]), parse_number(match[3]))
      end

      match = string.match(REGEX_SINGLE)
      if match
        num = parse_number(match[1])
        return Range.new(num, num)
      end

      nil
    end

    protected

    # Float when the text contains a decimal point, Integer otherwise.
    def parse_number(str)
      str.include?(".") ? str.to_f : str.to_i
    end
  end
end
@@ -0,0 +1,74 @@
1
module CorrectHorseBatteryStaple

  # Wraps an array-like object and adds a few descriptive statistics (mean,
  # sum, standard deviation, percentile slicing). Any message this class does
  # not define is forwarded to the wrapped object.
  class StatisticalArray
    # array  - the object to wrap.
    # sorted - pass true when +array+ is already sorted, so that #sort! and
    #          block-less #sort can skip the work.
    def initialize(array, sorted=false)
      @obj = array
      @sorted = sorted
    end

    # Returns +array+ unchanged when it already is a StatisticalArray,
    # otherwise wraps it in a new one.
    def self.cast(array, sorted=false)
      if array.is_a?(CorrectHorseBatteryStaple::StatisticalArray)
        array
      else
        CorrectHorseBatteryStaple::StatisticalArray.new(array, sorted)
      end
    end

    # Replaces the wrapped object with a sorted copy (unless it is already
    # flagged as sorted) and returns self.
    def sort!
      @obj = @obj.sort unless @sorted
      @sorted = true
      self
    end

    # Replaces the wrapped object with a copy sorted by the block, flags it as
    # sorted, and returns self.
    def sort_by!(&block)
      @obj = @obj.sort_by(&block)
      @sorted = true
      self
    end

    # Forwards every message not defined here to the wrapped object.
    def method_missing(name, *args, &block)
      @obj.__send__(name, *args, &block)
    end

    # Keeps respond_to? / method in agreement with the delegation above.
    def respond_to_missing?(name, include_private = false)
      @obj.respond_to?(name, include_private) || super
    end

    # Arithmetic mean as a Float (NaN for an empty collection).
    def mean
      inject(0) { |acc, x| acc + x } / size.to_f
    end
    alias :average :mean

    # Sum of the elements via reduce(:+); nil for an empty collection.
    def sum
      reduce(:+)
    end

    # With a block, or while not flagged as sorted, returns the wrapped
    # object's #sort result; otherwise returns self.
    def sort(&block)
      return @obj.sort(&block) if block || !@sorted
      self
    end

    # Sample standard deviation (divides by size - 1) around +m+.
    #
    # fdiv is used so that an Integer +m+ combined with Integer elements does
    # not truncate the variance through integer division.
    def standard_deviation(m = mean)
      squared_error = inject(0) { |acc, x| acc + (x - m) ** 2 }
      Math.sqrt(squared_error.fdiv(size - 1))
    end

    # Returns [mean, standard_deviation], computing the mean only once.
    def mean_and_standard_deviation
      m = mean
      return m, standard_deviation(m)
    end

    # Position corresponding to +percentile+ (0..100): percentile/100 * length
    # + 0.5, rounded to the nearest Integer unless +round+ is false.
    def percentile_index(percentile, round=true)
      r = percentile.to_f/100 * length + 0.5
      round ? r.round : r
    end

    # Index Range covering the percentile +range+. A bare Numeric is widened
    # to (range - 0.5)..(range + 0.5). Sorts the collection as a side effect.
    def index_range_for_percentile(range)
      range = Range.new(range - 0.5, range + 0.5) if range.is_a?(Numeric)
      sort!

      (percentile_index(range.begin, false).floor ..
       percentile_index(range.end, false).ceil)
    end

    # Slice of the (sorted) collection that falls within the percentile
    # +range+.
    def select_percentile(range)
      slice(index_range_for_percentile(range))
    end

  end
end
@@ -0,0 +1,22 @@
1
+ require 'ostruct'
2
+ require 'json'
3
+
4
module CorrectHorseBatteryStaple
  # OpenStruct of corpus statistics with Hash and JSON conversions.
  class Stats < OpenStruct
    # Returns the attributes as a Hash (OpenStruct's marshal_dump table).
    def to_hash
      marshal_dump
    end

    # Builds a Stats from +hash+. OpenStruct.new accepts String or Symbol
    # keys, so hashes produced by JSON.parse work directly.
    #
    # The previous implementation called marshal_load inside a +tap+ block
    # without a receiver, which targeted the class rather than the new
    # instance and always raised.
    def self.from_hash(hash)
      new(hash)
    end

    # JSON text for the attribute Hash. Accepts (and forwards) the generator
    # state argument that the json library passes when this object is nested
    # inside an Array or Hash being serialized.
    def to_json(*args)
      to_hash.to_json(*args)
    end

    # Builds a Stats from a JSON object string.
    def self.from_json(json)
      from_hash JSON.parse(json)
    end
  end
end
@@ -0,0 +1,90 @@
1
module CorrectHorseBatteryStaple
  # A single corpus entry: the word text plus its frequency statistics.
  # Words are ordered by frequency (Comparable).
  class Word
    # text of word
    attr_accessor :word

    # frequency is the total count of the word in corpus
    attr_accessor :frequency

    # rank is the word position when sorted by frequency in entire corpus
    # index is the index of the word in this (sub)corpus
    attr_accessor :rank, :index

    # dispersion is Juilland dispersion, the % of texts containing the
    # word.
    attr_accessor :dispersion

    # texts is the # of texts containing the word. this is not available
    # for many frequency lists.
    attr_accessor :texts

    ## statistical measure of word position in sorted frequency list

    # in which percentile does the word appear. this can be calculated
    # from the array of words so is somewhat redundant here
    attr_accessor :percentile

    # this word's frequency's distance from mean frequency in stddevs;
    # signed.
    attr_accessor :distance

    # probability is the chance of any given word in a text composed
    # of the sum of (word*frequency) in the corpus being this word.
    attr_accessor :probability

    # distance_probability is the distance of this word's probability
    # from the mean in stddev
    attr_accessor :distance_probability

    include Comparable

    # value_map - a Hash of attribute => value (String or Symbol keys), or
    #             another Word to copy.
    #
    # Raises RuntimeError for any other argument type, and ArgumentError when
    # no :word / "word" entry is supplied. The type is checked first so that
    # unsupported arguments reach the intended error instead of failing
    # inside the key lookup.
    def initialize(value_map = {})
      attributes =
        case value_map
        when Hash then value_map
        when CorrectHorseBatteryStaple::Word then value_map.to_hash
        else raise "Can't initialize Word from #{value_map.inspect}"
        end
      raise ArgumentError, "Must supply at least :word" unless attributes[:word] || attributes["word"]

      # phasing this out
      self.index = -1

      update_from_hash(attributes)
    end

    # Orders words by frequency. Returns nil (not comparable) for objects
    # that do not expose #frequency, so Comparable#== answers false for them
    # instead of raising NoMethodError.
    def <=>(other)
      return nil unless other.respond_to?(:frequency)
      self.frequency <=> other.frequency
    end

    # JSON for #to_hash; extra arguments are forwarded to Hash#to_json.
    def to_json(*args)
      to_hash.to_json(*args)
    end

    def to_s
      self.word
    end

    def inspect
      "CHBS::Word(#{self.to_hash.inspect})"
    end

    # Hash of every instance variable that has been set, keyed by its name
    # without the leading "@".
    def to_hash
      instance_variables.reduce({}) do |hash, key|
        hash[key.to_s[1..-1]] = instance_variable_get(key)
        hash
      end
    end

    # Assigns each key/value pair through the matching attribute writer,
    # skipping the key "wstruct". Returns self.
    def update_from_hash(hash)
      hash.each do |key, val|
        self[key] = val unless key.to_s == "wstruct"
      end
      self
    end

    # Reads attribute +attr+ (String or Symbol). public_send keeps the lookup
    # restricted to the public interface.
    def [](attr)
      public_send(attr.to_s)
    end

    # Writes attribute +attr+ through its public "attr=" writer.
    def []=(attr, value)
      public_send("#{attr}=", value)
    end
  end
end
@@ -0,0 +1,29 @@
1
# Factory and entry point for corpus writers. Concrete writers are nested
# classes that are autoloaded on first use.
class CorrectHorseBatteryStaple::Writer
  autoload :Base,    "correct_horse_battery_staple/writer/base"
  autoload :File,    "correct_horse_battery_staple/writer/file"
  autoload :Json,    "correct_horse_battery_staple/writer/json"
  autoload :Csv,     "correct_horse_battery_staple/writer/csv"
  autoload :Isam,    "correct_horse_battery_staple/writer/isam"
  autoload :Isamkd,  "correct_horse_battery_staple/writer/isam_kd"
  autoload :Marshal, "correct_horse_battery_staple/writer/marshal"
  autoload :Sqlite,  "correct_horse_battery_staple/writer/sqlite"
  autoload :Redis,   "correct_horse_battery_staple/writer/redis"
  # autoload :KDTree, "correct_horse_battery_staple/writer/kdtree"

  # Instantiates the writer class for +fformat+, passing it +dest+ and
  # +options+. When +fformat+ is nil/false it is derived from +dest+ via
  # Corpus.format_for. The class is looked up by the downcased, capitalized
  # format name (e.g. "json" -> Json).
  #
  # Raises ArgumentError when no format can be determined.
  def self.make_writer(dest, fformat, options = {})
    format_name = fformat || CorrectHorseBatteryStaple::Corpus.format_for(dest)
    unless format_name && !format_name.empty?
      raise ArgumentError, "Cannot determine file format for #{dest}"
    end

    writer_class = const_get(format_name.downcase.capitalize)
    writer_class.new(dest, options)
  end

  # Writes +corpus+ to +dest+ with a writer built by make_writer, closing
  # the writer afterwards even if writing raises. Returns the result of the
  # writer's #write_corpus.
  def self.write(corpus, dest, fformat, options = {})
    writer = make_writer(dest, fformat, options)
    begin
      writer.write_corpus(corpus)
    ensure
      writer.close if writer
    end
  end
end
@@ -0,0 +1,22 @@
1
+ #
2
+ # Base class for all writers
3
+ #
4
+
5
class CorrectHorseBatteryStaple::Writer::Base < CorrectHorseBatteryStaple::Writer
  include CorrectHorseBatteryStaple::Common

  attr_accessor :dest, :options

  # Stores +dest+ and +options+ through their writers, then runs the
  # optional initialize_backend_variables hook when the object provides one.
  def initialize(dest, options = {})
    self.dest = dest
    self.options = options
    if respond_to?(:initialize_backend_variables)
      initialize_backend_variables
    end
  end

  # Subclasses must override this; the base implementation always raises
  # NotImplementedError.
  def write_corpus(corpus)
    raise NotImplementedError, "#{self.class.name} is not a complete implementation"
  end

  # No-op by default; subclasses that hold resources override it.
  def close
  end
end
@@ -0,0 +1,15 @@
1
# Writes a corpus as CSV text: one header line followed by one line per word.
class CorrectHorseBatteryStaple::Writer::Csv < CorrectHorseBatteryStaple::Writer::File

  def initialize(dest, options={})
    super
  end

  # Emits the header and then one formatted row per corpus entry. Any
  # attribute that is nil/false is written as -1.
  def write_corpus(corpus)
    puts "index,rank,word,frequency,percentile,distance,probability,distance_probability"

    row_template = "%d,%d,\"%s\",%d,%.4f,%.6f,%.8f,%.8f\n"
    corpus.each_with_index do |entry, position|
      row = sprintf(row_template,
                    position || -1,
                    entry.rank || -1,
                    entry.word,
                    entry.frequency || -1,
                    entry.percentile || -1,
                    entry.distance || -1,
                    entry.probability || -1,
                    entry.distance_probability || -1)
      puts row
    end
  end
end
@@ -0,0 +1,54 @@
1
+ #
2
+ # base class for file-based stores
3
+ #
4
+ #
5
class CorrectHorseBatteryStaple::Writer::File < CorrectHorseBatteryStaple::Writer::Base
  attr_accessor :io

  # dest - one of:
  #        * an object responding to #write, used as-is and never closed here;
  #        * "/dev/stdout" or "-", meaning STDOUT (never closed here);
  #        * anything else, treated as a path and opened with #openmode; the
  #          resulting handle is closed by #close.
  def initialize(dest, options = {})
    super

    @do_close = false
    if dest.respond_to?(:write)
      self.io = dest
    elsif ["/dev/stdout", "-"].include?(dest)
      self.io = STDOUT
    else
      # ::File.open rather than Kernel#open: Kernel#open runs a destination
      # that starts with "|" as a shell command. The leading :: is needed
      # because a bare File inside this class would resolve to Writer::File.
      self.io = ::File.open(dest, openmode)
      @do_close = true
    end
  end

  # Closes the handle only when this writer opened it. Errors raised while
  # closing are deliberately ignored (best effort). The io reference and the
  # ownership flag are always reset.
  def close
    return unless @do_close
    begin
      self.io.close
    rescue StandardError
      nil
    end
  ensure
    self.io = nil
    @do_close = false
  end

  protected

  # Mode string used when opening a path; subclasses may override.
  def openmode
    "w"
  end

  # The helpers below route output to the underlying io so that subclasses
  # can call puts/print/write/<< directly.

  def <<(string)
    self.io.write string
  end

  def print(string)
    self.io.print string
  end

  def puts(string)
    self.io.puts string
  end

  def write(string)
    self.io.write string
  end

end