gibberish_detector 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,23 @@
1
+ # coding: utf-8
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'gibberish_detector/version'
5
+
6
# Gem specification for gibberish_detector: package metadata, the list of
# shipped files, and development-time dependencies.
Gem::Specification.new do |s|
  s.name        = "gibberish_detector"
  s.version     = GibberishDetector::VERSION
  s.summary     = "Detect gibberish in strings"
  s.date        = "2014-02-17"
  s.description = "This gem uses the Markov chain to study and analyze text and decide if it is gibberish or not."
  s.authors     = ["Michael Chittenden"]
  s.email       = ["mchitten@gmail.com"]
  s.license     = "MIT"
  s.homepage    = "https://github.com/mchitten/gibberish_detector"

  # Ship every file tracked by git.
  s.files         = `git ls-files`.split($/)
  # Only files under a top-level test/, spec/ or features/ directory are test
  # files. The pattern is anchored because an unanchored /test|spec|features/
  # also matches "gibberish_detector.gemspec" (the name contains "spec").
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ["lib"]

  s.add_development_dependency "bundler", "~> 1.3"
  s.add_development_dependency "rake"
end
data/good.txt ADDED
@@ -0,0 +1,6 @@
1
+ rob
2
+ two models
3
+ some long sentence, might suck?
4
+ Project Gutenberg
5
+ a b c
6
+
@@ -0,0 +1,139 @@
1
+ require 'yaml'
2
+ require 'gibberish_detector/version'
3
+ require 'string'
4
+
5
+ class GibberishDetectorException < Exception ; end
6
+
7
+ class GibberishDetector
8
+ ACCEPTED_CHARACTERS = 'abcdefghijklmnopqrstuvwxyz ';
9
+ DATA_FILE = File.join(File.dirname(__FILE__), '..', '.trained_data.yml')
10
+
11
+ class << self
12
+ def gibberish?(text, opts = {})
13
+ opts[:lib_path] ||= DATA_FILE
14
+ opts[:raw] ||= false
15
+
16
+ raise GibberishDetectorException, "Please run Gibberish.train! to build your trained data file." unless File.exist?(opts[:lib_path])
17
+
18
+ trained_library = YAML.load(File.open(opts[:lib_path]))
19
+ raise GibberishDetectorException, 'Please run Gibberish.train! to build your trained data file.' if trained_library.nil?
20
+
21
+ value = _averageTransitionProbability(text, trained_library[:matrix])
22
+ return value if opts[:raw] == true
23
+
24
+ return true if value <= trained_library[:threshold]
25
+
26
+ false
27
+ end
28
+
29
+ def train!(opts={})
30
+ opts[:big_text_file] = 'big.txt'
31
+ opts[:good_text_file] = 'good.txt'
32
+ opts[:bad_text_file] = 'bad.txt'
33
+ opts[:lib_path] = DATA_FILE
34
+
35
+ if File.exist?(opts[:big_text_file]) == false || File.exist?(opts[:good_text_file]) == false || File.exist?(opts[:bad_text_file]) == false
36
+ raise GibberishDetectorException, "We couldn't find one of #{opts[:big_text_file]}, #{opts[:good_text_file]} or #{opts[:bad_text_file]}. Please ensure all three files exist before training."
37
+ return false
38
+ end
39
+
40
+ k = ACCEPTED_CHARACTERS.length
41
+ hsh = {}
42
+ pos = ACCEPTED_CHARACTERS.dup.split('').each_with_index do |key, index|
43
+ hsh[key] = index
44
+ end.reverse
45
+ pos = hsh
46
+
47
+ log_prob_matrix = {}
48
+ range = (0...k).to_a
49
+ range.each do |index|
50
+ arr = {}
51
+ range.each do |index2|
52
+ arr[index2] = 10
53
+ end
54
+
55
+ log_prob_matrix[index] = arr
56
+ end
57
+
58
+ lines = File.open(opts[:big_text_file]).read
59
+ lines.each_line do |line|
60
+ filtered_line = normalize(line).split('')
61
+ a = false
62
+ filtered_line.each do |b|
63
+ if a != false
64
+ log_prob_matrix[pos[a]] ||= {}
65
+ log_prob_matrix[pos[a]][pos[b]] ||= 0
66
+ log_prob_matrix[pos[a]][pos[b]] += 1
67
+ end
68
+ a = b
69
+ end
70
+ end
71
+
72
+ log_prob_matrix.each do |i, row|
73
+ s = row.values.inject(:+).to_f
74
+ row.each do |k, j|
75
+ log_prob_matrix[i][k] = Math.log(j / s)
76
+ end
77
+ end
78
+
79
+ good_lines = File.open(opts[:good_text_file]).read
80
+ good_probs = []
81
+ good_lines.each_line do |line|
82
+ good_probs << _averageTransitionProbability(line.chomp, log_prob_matrix)
83
+ end
84
+
85
+ bad_lines = File.open(opts[:bad_text_file]).read
86
+ bad_probs = []
87
+ bad_lines.each_line do |line|
88
+ bad_probs << _averageTransitionProbability(line.chomp, log_prob_matrix)
89
+ end
90
+
91
+ min_good_probs = good_probs.min
92
+ max_bad_probs = bad_probs.max
93
+
94
+ if min_good_probs <= max_bad_probs
95
+ raise GibberishDetectorException, "The prob counts are invalid."
96
+ end
97
+
98
+ threshold = (min_good_probs + max_bad_probs) / 2
99
+ File.open(opts[:lib_path], 'w+') do |file|
100
+ data = {
101
+ :matrix => log_prob_matrix,
102
+ :threshold => threshold
103
+ }
104
+
105
+ file << data.to_yaml
106
+ end
107
+ end
108
+
109
+ private
110
+ def normalize(text)
111
+ text.downcase.gsub(/[^a-z\ ]/, '')
112
+ end
113
+
114
+ def _averageTransitionProbability(line, log_prob_matrix)
115
+ log_prob = 1.0
116
+ transition_ct = 0
117
+
118
+ hsh = {}
119
+ ACCEPTED_CHARACTERS.dup.split('').each_with_index do |key, index|
120
+ hsh[key] = index
121
+ end.reverse
122
+ pos = hsh
123
+
124
+ filtered_line = normalize(line.dup).split('')
125
+ a = false
126
+ filtered_line.each do |b|
127
+ if a != false
128
+ log_prob += log_prob_matrix[pos[a]][pos[b]]
129
+ transition_ct += 1
130
+ end
131
+
132
+ a = b
133
+ end
134
+
135
+ Math.exp(log_prob / [transition_ct, 1].max)
136
+ end
137
+ end
138
+ end
139
+
@@ -0,0 +1,3 @@
1
class GibberishDetector
  # Released gem version. Frozen so the shared constant cannot be mutated in
  # place by code that reads it.
  VERSION = "1.0.0".freeze
end
data/lib/string.rb ADDED
@@ -0,0 +1,5 @@
1
# Convenience extension so any String can be checked directly.
class String
  # Asks GibberishDetector whether this string is gibberish.
  #
  # @param opts [Hash] forwarded unchanged to GibberishDetector.gibberish?
  #   as its second argument; defaults to an empty hash, so calling with no
  #   arguments behaves as before
  # @return whatever GibberishDetector.gibberish? returns for this string
  def gibberish?(opts = {})
    GibberishDetector.gibberish?(self, opts)
  end
end
@@ -0,0 +1,13 @@
1
+ require 'test/unit'
2
+ require 'gibberish_detector'
3
+
4
# Smoke tests for String#gibberish?.
class GibberishDetectorTest < Test::Unit::TestCase
  # A random-looking run of letters must be reported as gibberish.
  def test_gibberish
    gibberish = "asodfjasdf"
    # assert_equal takes (expected, actual); this order keeps failure
    # messages accurate.
    assert_equal true, gibberish.gibberish?
  end

  # Ordinary English words must not be reported as gibberish.
  def test_non_gibberish
    sensible = "hello world"
    assert_equal false, sensible.gibberish?
  end
end
metadata ADDED
@@ -0,0 +1,89 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gibberish_detector
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Michael Chittenden
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2014-02-17 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: bundler
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ~>
18
+ - !ruby/object:Gem::Version
19
+ version: '1.3'
20
+ type: :development
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ~>
25
+ - !ruby/object:Gem::Version
26
+ version: '1.3'
27
+ - !ruby/object:Gem::Dependency
28
+ name: rake
29
+ requirement: !ruby/object:Gem::Requirement
30
+ requirements:
31
+ - - '>='
32
+ - !ruby/object:Gem::Version
33
+ version: '0'
34
+ type: :development
35
+ prerelease: false
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - '>='
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: This gem uses the Markov chain to study and analyze text and decide if
42
+ it is gibberish or not.
43
+ email:
44
+ - mchitten@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - .gitignore
50
+ - .trained_data.yml
51
+ - CREDITS.md
52
+ - Gemfile
53
+ - README.md
54
+ - Rakefile
55
+ - bad.txt
56
+ - big.txt
57
+ - gibberish_detector.gemspec
58
+ - good.txt
59
+ - lib/gibberish_detector.rb
60
+ - lib/gibberish_detector/version.rb
61
+ - lib/string.rb
62
+ - test/test_gibberish_detector.rb
63
+ homepage: https://github.com/mchitten/gibberish_detector
64
+ licenses:
65
+ - MIT
66
+ metadata: {}
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - '>='
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - '>='
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 2.0.5
84
+ signing_key:
85
+ specification_version: 4
86
+ summary: Detect gibberish in strings
87
+ test_files:
88
+ - gibberish_detector.gemspec
89
+ - test/test_gibberish_detector.rb