gibberish_detector 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +3 -0
- data/.trained_data.yml +759 -0
- data/CREDITS.md +9 -0
- data/Gemfile +4 -0
- data/README.md +58 -0
- data/Rakefile +8 -0
- data/bad.txt +5 -0
- data/big.txt +128457 -0
- data/gibberish_detector.gemspec +23 -0
- data/good.txt +6 -0
- data/lib/gibberish_detector.rb +139 -0
- data/lib/gibberish_detector/version.rb +3 -0
- data/lib/string.rb +5 -0
- data/test/test_gibberish_detector.rb +13 -0
- metadata +89 -0
@@ -0,0 +1,23 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'gibberish_detector/version'
|
5
|
+
|
6
|
+
# Gem specification for gibberish_detector.
#
# The file list is taken from `git ls-files`, so the gem must be built from
# inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "gibberish_detector"
  s.version     = GibberishDetector::VERSION
  s.summary     = "Detect gibberish in strings"
  s.date        = "2014-02-17"
  s.description = "This gem uses the Markov chain to study and analyze text and decide if it is gibberish or not."
  s.authors     = ["Michael Chittenden"]
  s.email       = ["mchitten@gmail.com"]
  s.license     = "MIT"
  s.homepage    = "https://github.com/mchitten/gibberish_detector"

  s.files         = `git ls-files`.split($/)
  # Anchor the pattern to top-level test directories. An unanchored
  # /test|spec|features/ also matches "gibberish_detector.gemspec" (it
  # contains "spec") and wrongly ships the gemspec as a test file.
  s.test_files    = s.files.grep(%r{^(test|spec|features)/})
  s.require_paths = ["lib"]

  s.add_development_dependency "bundler", "~> 1.3"
  s.add_development_dependency "rake"
end
|
data/good.txt
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
require 'yaml'
|
2
|
+
require 'gibberish_detector/version'
|
3
|
+
require 'string'
|
4
|
+
|
5
|
+
# Raised when the trained data file or the training inputs are missing or
# unusable.
#
# Derives from StandardError (not Exception) so that a plain `rescue` or
# `rescue => e` in calling code catches it; rescuing by class name keeps
# working exactly as before.
class GibberishDetectorException < StandardError; end
|
6
|
+
|
7
|
+
# Markov-chain gibberish detector.
#
# A model is a 27x27 matrix of log transition probabilities between the
# characters in ACCEPTED_CHARACTERS plus a threshold. Text whose average
# transition probability is at or below the threshold is reported as
# gibberish.
class GibberishDetector
  # Characters the model knows about; normalize strips everything else.
  ACCEPTED_CHARACTERS = 'abcdefghijklmnopqrstuvwxyz '.freeze

  # Default location of the trained model, one directory above this file.
  DATA_FILE = File.join(File.dirname(__FILE__), '..', '.trained_data.yml')

  # Maps each accepted character to its row/column index in the matrix.
  CHARACTER_INDEX = ACCEPTED_CHARACTERS.each_char.with_index
                                       .each_with_object({}) { |(char, index), map| map[char] = index }
                                       .freeze

  class << self
    # Decides whether +text+ is gibberish according to the trained model.
    #
    # @param text [String] text to score
    # @param opts [Hash] :lib_path - model file to read (default DATA_FILE);
    #   :raw - when exactly +true+, return the numeric score instead of a
    #   boolean
    # @return [Boolean, Float] true when the score is at or below the model
    #   threshold, false otherwise; the raw score when opts[:raw] is true
    # @raise [GibberishDetectorException] if the model file is missing or
    #   does not contain a model
    def gibberish?(text, opts = {})
      # Read the options without writing defaults back into the caller's hash.
      lib_path = opts[:lib_path] || DATA_FILE

      unless File.exist?(lib_path)
        raise GibberishDetectorException, 'Please run GibberishDetector.train! to build your trained data file.'
      end

      # NOTE(review): YAML.load may instantiate arbitrary objects on older
      # Psych versions; only point :lib_path at files you trust.
      # File.read closes the handle, unlike the bare File.open used before.
      trained_library = YAML.load(File.read(lib_path))
      # An empty file loads as nil/false; anything that is not a Hash is unusable.
      unless trained_library.is_a?(Hash)
        raise GibberishDetectorException, 'Please run GibberishDetector.train! to build your trained data file.'
      end

      value = _averageTransitionProbability(text, trained_library[:matrix])
      return value if opts[:raw] == true

      value <= trained_library[:threshold]
    end

    # Builds the model from three text files and writes it out as YAML.
    #
    # @param opts [Hash] :big_text_file - corpus used to count character
    #   transitions (default 'big.txt'); :good_text_file - sample lines that
    #   are not gibberish (default 'good.txt'); :bad_text_file - sample lines
    #   that are gibberish (default 'bad.txt'); :lib_path - where to write
    #   the model (default DATA_FILE). The default text file names are
    #   relative to the current working directory.
    # @raise [GibberishDetectorException] if an input file is missing, a
    #   sample file is empty, or the good and bad samples cannot be
    #   separated by a threshold
    def train!(opts = {})
      # Fall back to the defaults only for options the caller did not supply
      # (they used to be overwritten unconditionally, ignoring the caller).
      big_text_file  = opts[:big_text_file]  || 'big.txt'
      good_text_file = opts[:good_text_file] || 'good.txt'
      bad_text_file  = opts[:bad_text_file]  || 'bad.txt'
      lib_path       = opts[:lib_path]       || DATA_FILE

      unless [big_text_file, good_text_file, bad_text_file].all? { |path| File.exist?(path) }
        raise GibberishDetectorException, "We couldn't find one of #{big_text_file}, #{good_text_file} or #{bad_text_file}. Please ensure all three files exist before training."
      end

      log_prob_matrix = build_log_prob_matrix(big_text_file)
      good_probs = score_lines(good_text_file, log_prob_matrix)
      bad_probs  = score_lines(bad_text_file, log_prob_matrix)

      # Without at least one sample of each kind there is nothing to compare.
      if good_probs.empty? || bad_probs.empty?
        raise GibberishDetectorException, "The good and bad text files must each contain at least one line."
      end

      min_good_probs = good_probs.min
      max_bad_probs  = bad_probs.max

      # Every good sample must score strictly higher than every bad sample,
      # otherwise no threshold can separate the two sets.
      if min_good_probs <= max_bad_probs
        raise GibberishDetectorException, "The prob counts are invalid."
      end

      # The threshold sits midway between the worst good and the best bad score.
      threshold = (min_good_probs + max_bad_probs) / 2

      File.open(lib_path, 'w+') do |file|
        file << { :matrix => log_prob_matrix, :threshold => threshold }.to_yaml
      end
    end

    private

    # Lower-cases +text+ and removes every character that is not a-z or space.
    def normalize(text)
      text.downcase.gsub(/[^a-z\ ]/, '')
    end

    # Counts character-to-character transitions in the corpus and converts
    # each row of counts into log probabilities.
    def build_log_prob_matrix(big_text_file)
      size = ACCEPTED_CHARACTERS.length

      # Every pair starts at 10 so transitions never seen in the corpus
      # still get a non-zero probability.
      counts = {}
      size.times do |from|
        counts[from] = {}
        size.times { |to| counts[from][to] = 10 }
      end

      # Lines are normalized one at a time, so no transition spans a line break.
      File.foreach(big_text_file) do |line|
        normalize(line).each_char.each_cons(2) do |from, to|
          counts[CHARACTER_INDEX[from]][CHARACTER_INDEX[to]] += 1
        end
      end

      counts.each_value do |row|
        total = row.values.inject(:+).to_f
        row.each { |to, count| row[to] = Math.log(count / total) }
      end

      counts
    end

    # Scores every line of the file at +path+ against +log_prob_matrix+.
    def score_lines(path, log_prob_matrix)
      File.foreach(path).map do |line|
        _averageTransitionProbability(line.chomp, log_prob_matrix)
      end
    end

    # Returns exp(accumulated log probability / number of transitions) for
    # the normalized +line+. Text with fewer than two accepted characters
    # has no transitions and yields Math.exp(1.0).
    def _averageTransitionProbability(line, log_prob_matrix)
      # NOTE(review): the accumulator starts at 1.0 rather than 0.0. This is
      # kept on purpose: thresholds in existing trained data files were
      # computed with this offset, so changing it would invalidate them.
      log_prob = 1.0
      transition_ct = 0

      normalize(line).each_char.each_cons(2) do |from, to|
        log_prob += log_prob_matrix[CHARACTER_INDEX[from]][CHARACTER_INDEX[to]]
        transition_ct += 1
      end

      # Guard against division by zero when there are no transitions.
      Math.exp(log_prob / [transition_ct, 1].max)
    end
  end
end
|
139
|
+
|
data/lib/string.rb
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
require 'test/unit'
|
2
|
+
require 'gibberish_detector'
|
3
|
+
|
4
|
+
# Smoke tests for String#gibberish?.
# NOTE(review): String#gibberish? is presumably added by lib/string.rb, which
# 'gibberish_detector' requires; its definition is not visible here.
class GibberishDetectorTest < Test::Unit::TestCase
  # A random-looking run of letters should be reported as gibberish.
  def test_gibberish
    gibberish = "asodfjasdf"
    # assert_equal takes (expected, actual); this order keeps failure
    # messages accurate.
    assert_equal true, gibberish.gibberish?
  end

  # Ordinary English words should not be reported as gibberish.
  def test_non_gibberish
    sensible = "hello world"
    assert_equal false, sensible.gibberish?
  end
end
|
metadata
ADDED
@@ -0,0 +1,89 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: gibberish_detector
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Michael Chittenden
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2014-02-17 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.3'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - '>='
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - '>='
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: This gem uses the Markov chain to study and analyze text and decide if
|
42
|
+
it is gibberish or not.
|
43
|
+
email:
|
44
|
+
- mchitten@gmail.com
|
45
|
+
executables: []
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- .gitignore
|
50
|
+
- .trained_data.yml
|
51
|
+
- CREDITS.md
|
52
|
+
- Gemfile
|
53
|
+
- README.md
|
54
|
+
- Rakefile
|
55
|
+
- bad.txt
|
56
|
+
- big.txt
|
57
|
+
- gibberish_detector.gemspec
|
58
|
+
- good.txt
|
59
|
+
- lib/gibberish_detector.rb
|
60
|
+
- lib/gibberish_detector/version.rb
|
61
|
+
- lib/string.rb
|
62
|
+
- test/test_gibberish_detector.rb
|
63
|
+
homepage: https://github.com/mchitten/gibberish_detector
|
64
|
+
licenses:
|
65
|
+
- MIT
|
66
|
+
metadata: {}
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options: []
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - '>='
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - '>='
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project:
|
83
|
+
rubygems_version: 2.0.5
|
84
|
+
signing_key:
|
85
|
+
specification_version: 4
|
86
|
+
summary: Detect gibberish in strings
|
87
|
+
test_files:
|
88
|
+
- gibberish_detector.gemspec
|
89
|
+
- test/test_gibberish_detector.rb
|