dejunk 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: 11a02210b2135091e658c20aea3611ccb4b22627
4
+ data.tar.gz: 6bd647976a92dff85ade9fca15f45848ed577118
5
+ SHA512:
6
+ metadata.gz: b6f54d2dd4a8ec45dd8209d34ee6bbc1eb6ba76819f25720cec74f3a0af4233eb71c4059dfe58b33ec668bec0a19a616eadf6a822648485a714b610254d338dc
7
+ data.tar.gz: 5e92d10677561fa1b2b3afdf4e6ba582098266c9a51a92053e7cc80eb13107a54cec524c1921fbf4ad46937b0f44e37e00b88533d3d5992fec08d3c7b07141e7
data/.gitignore ADDED
@@ -0,0 +1,9 @@
1
+ /.bundle/
2
+ /.yardoc
3
+ /Gemfile.lock
4
+ /_yardoc/
5
+ /coverage/
6
+ /doc/
7
+ /pkg/
8
+ /spec/reports/
9
+ /tmp/
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format documentation
2
+ --color
data/.travis.yml ADDED
@@ -0,0 +1,4 @@
1
+ language: ruby
2
+ rvm:
3
+ - 2.2.3
4
+ before_install: gem install bundler -v 1.10.6
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source 'https://rubygems.org'
2
+
3
+ # Specify your gem's dependencies in dejunk.gemspec
4
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,202 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "{}"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright {yyyy} {name of copyright owner}
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
202
+
data/README.md ADDED
@@ -0,0 +1,73 @@
1
+ # Dejunk
2
+
3
+ Detect keyboard mashing and other junk in your data.
4
+
5
+ For example, you might allow user-entered tags but want to hide bad ones, or
6
+ you might want to detect user frustration when filling out a particular field
7
+ and do something about it!
8
+
9
+ Uses a variety of heuristics, the most sophisticated being a comparison of
10
+ bigrams in the input to the frequencies in a "known-good" corpus vs. their
11
+ proximity on a keyboard. Achieves pretty good precision on Academia.edu's data,
12
+ but might need adjustment for yours.
13
+
14
+ ## Installation
15
+
16
+ Add this line to your application's Gemfile:
17
+
18
+ ```ruby
19
+ gem 'dejunk'
20
+ ```
21
+
22
+ And then execute:
23
+
24
+ $ bundle
25
+
26
+ Or install it yourself as:
27
+
28
+ $ gem install dejunk
29
+
30
+ ## Usage
31
+
32
+ The main interface is `Dejunk.is_junk?`. Pass a string, and get a truthy value
33
+ if it looks junky, and false otherwise.
34
+
35
+ ```ruby
36
+ $ Dejunk.is_junk?('Hello World')
37
+ => false
38
+ $ Dejunk.is_junk?('qwefqwef')
39
+ => :mashing_bigrams
40
+
41
+ $ Dejunk.is_junk?('asdf')
42
+ => :asdf_row
43
+ $ Dejunk.is_junk?('fads')
44
+ => false
45
+
46
+ $ Dejunk.is_junk?('Hi')
47
+ => :too_short
48
+ $ Dejunk.is_junk?('Hi', whitelist_regexes: [/\Ahi\z/i])
49
+ => false
50
+ ```
51
+
52
+ When junk is detected, the return value is a symbol naming the reason, to aid
53
+ in debugging. Optional parameters are `min_alnum_chars` (defaults to 3), and
54
+ `whitelist_strings` and `whitelist_regexes` (both default to none).
55
+
56
+ ## Development
57
+
58
+ After checking out the repo, run `bin/setup` to install dependencies. Then, run
59
+ `rake spec` to run the tests. You can also run `bin/console` for an interactive
60
+ prompt that will allow you to experiment.
61
+
62
+ To install this gem onto your local machine, run `bundle exec rake install`.
63
+ To release a new version, update the version number in `version.rb`, and then run
64
+ `bundle exec rake release`, which will create a git tag for the version,
65
+ push git commits and tags, and push the `.gem` file to [rubygems.org](https://rubygems.org).
66
+
67
+ ## Contributing
68
+
69
+ Bug reports and pull requests are welcome on GitHub at https://github.com/academia-edu/dejunk
70
+
71
+ ## License
72
+
73
+ Apache 2.0
data/Rakefile ADDED
@@ -0,0 +1,6 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
+ RSpec::Core::RakeTask.new(:spec)
5
+
6
+ task :default => :spec
data/bin/console ADDED
@@ -0,0 +1,14 @@
1
+ #!/usr/bin/env ruby
2
+
3
+ require "bundler/setup"
4
+ require "dejunk"
5
+
6
+ # You can add fixtures and/or initialization code here to make experimenting
7
+ # with your gem easier. You can also use a different console, if you like.
8
+
9
+ # (If you use this, don't forget to add pry to your Gemfile!)
10
+ # require "pry"
11
+ # Pry.start
12
+
13
+ require "irb"
14
+ IRB.start
data/bin/setup ADDED
@@ -0,0 +1,7 @@
1
+ #!/bin/bash
2
+ set -euo pipefail
3
+ IFS=$'\n\t'
4
+
5
+ bundle install
6
+
7
+ # Do any other automated setup that you need to do here
data/dejunk.gemspec ADDED
@@ -0,0 +1,25 @@
1
# coding: utf-8
# Gem specification for dejunk. The version constant is loaded from
# lib/dejunk/version.rb, so lib/ is put on the load path first.
lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'dejunk/version'

Gem::Specification.new do |spec|
  spec.name          = "dejunk"
  spec.version       = Dejunk::VERSION
  spec.authors       = ["David Judd"]
  spec.email         = ["david@academia.edu"]

  spec.summary       = 'Detect keyboard mashing and other junk in your data.'
  spec.homepage      = 'https://github.com/academia-edu/dejunk'
  # Matches the LICENSE file shipped with the gem (Apache License 2.0);
  # without it `gem build` warns that no license is specified.
  spec.license       = 'Apache-2.0'

  # Package every git-tracked file except tests.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = "exe"
  spec.executables   = spec.files.grep(%r{^exe/}) { |f| File.basename(f) }
  spec.require_paths = ["lib"]

  spec.add_dependency 'activesupport'

  spec.add_development_dependency "bundler", "~> 1.10"
  spec.add_development_dependency "rake", "~> 10.0"
  spec.add_development_dependency "rspec"
end
data/lib/dejunk.rb ADDED
@@ -0,0 +1,270 @@
1
require "dejunk/version"
require "bigdecimal"
require "set"
require "yaml"
require "active_support/core_ext/string"
4
+
5
# Heuristics for detecting keyboard mashing and other junk strings.
#
# The module extends itself, so every public method is callable as
# `Dejunk.method_name`. The main entry point is `Dejunk.is_junk?`.
module Dejunk
  extend self

  # All characters on the middle row of a QWERTY keyboard
  MASH_CHARS = 'ASDFGHJKLasdfghjkl;: '.freeze

  # All neighboring key pairs on a QWERTY keyboard, except "er" and "re" which
  # each make up >1% of bigrams in our "good" sample, plus each letter repeated
  # or with a space. Every bigram is stored in both orders.
  #
  # NOTE(review): the bottom-row run lists "xd" where "xc" would be expected,
  # and "io" is absent although "ui" and "op" are present -- confirm whether
  # these are intentional before changing them, since the mashing frequencies
  # below depend on the size of this set.
  MASH_BIGRAMS = (
    ("abcdefghijklmnopqrstuvwxyz".chars.flat_map { |l| ["#{l} ", "#{l}#{l}"] }) +
    %w( qw we rt ty yu ui op as sd df fg gh hj jk kl zx xd cv vb bn nm qa az ws sx ed dc rf fv tg gb yh hn uj jm ik ol )
  ).flat_map { |bigram| [bigram, bigram.reverse] }.to_set.freeze

  # Decide whether a string looks like junk.
  #
  # string            - the String to test (nil is treated as junk)
  # min_alnum_chars   - minimum number of alphanumeric characters (after
  #                     normalization) a non-junk string must contain
  # whitelist_regexes - Regexps; a string matching any of them is never junk
  # whitelist_strings - Strings; an exact member is never junk
  #
  # Returns false when the string passes every check, otherwise a Symbol naming
  # the first check that failed: :no_alpha, :too_short, :one_char_repeat,
  # :starts_with_punct, :too_many_short_words, :three_chars_repeat_twice,
  # :fuck, :missing_vowels, :asdf_row, :mashing_bigrams or :unlikely_bigrams.
  def is_junk?(string, min_alnum_chars: 3, whitelist_regexes: [], whitelist_strings: [])
    if string && (whitelist_strings.include?(string) || whitelist_regexes.any? { |re| string =~ re })
      return false
    end

    return :no_alpha if string.nil? || string !~ /[[:alpha:]]/

    normed = normalize_for_comparison(string)

    return :too_short if too_few_alphanumeric_chars?(normed, min_alnum_chars)
    return :one_char_repeat if excessive_single_character_repeats?(string, normed)
    return :starts_with_punct if starts_with_disallowed_punctuation?(string)
    return :too_many_short_words if too_many_short_words?(string)
    return :three_chars_repeat_twice if three_plus_chars_repeat_twice?(string)
    return :fuck if string =~ /\bfuck/i
    return :missing_vowels if missing_vowels?(string, normed)
    return :asdf_row if asdf_row_and_suspicious?(string)

    ascii_proportion = string.chars.count { |c| c.ord < 128 }.to_f / string.length

    # The bigrams look like the ones you'd get from keyboard mashing
    # (the probability shouldn't be taken too literally, > 0.25 is almost all
    # mashing in practice on our corpus)
    if string.length > 1 && ascii_proportion > 0.8
      if probability_of_keyboard_mashing(string) > 0.25
        return :mashing_bigrams
      end
    end

    # The bigrams don't look like the bigrams in legitimate strings
    if string.length > 6 && ascii_proportion > 0.8
      corpus_similarity = bigram_similarity_to_corpus(string)

      # The similarity is more accurate for longer strings, and with more ASCII,
      # so increase the value (= lower the threshold) for shorter strings and
      # strings with less ASCII.
      score = corpus_similarity * (1.0/ascii_proportion**2) * (1.0/(1 - Math.exp(-0.1*string.length)))

      if score < 0.03
        return :unlikely_bigrams
      elsif score < 0.08 && string !~ /\A([[:upper:]][[:lower:]]+ )*[[:upper:]][[:lower:]]+\z/
        # The similarity ignores casing, so instead use a higher threshold if
        # the casing looks wrong
        return :unlikely_bigrams
      elsif score < bigram_similarity_to_mashing(string)
        return :mashing_bigrams
      end
    end

    false
  end

  # Cosine similarity between vector of frequencies of bigrams within string,
  # and vector of frequencies of all bigrams within corpus.
  #
  # Returns 0.0 when the string yields no bigrams (nil, or fewer than two
  # characters after stripping) instead of failing on an empty sum.
  def bigram_similarity_to_corpus(string)
    freqs = bigram_frequencies(string)
    return 0.0 if freqs.empty?

    cosine_similarity(freqs, corpus_bigram_frequencies, corpus_bigram_magnitude)
  end

  # Cosine similarity between vector of frequencies of bigrams within string,
  # and vector which assumes all bigrams made of neighboring pairs on the keyboard
  # are equally likely, and no others appear.
  #
  # Returns 0.0 when the string yields no bigrams.
  def bigram_similarity_to_mashing(string)
    freqs = bigram_frequencies(string)
    return 0.0 if freqs.empty?

    cosine_similarity(freqs, mashing_bigram_frequencies, mashing_bigram_magnitude)
  end

  # Split a string into its overlapping two-character sequences.
  #
  # The string is stripped first; each bigram is downcased, every digit is
  # replaced by "0" and every whitespace character by a plain space, so that
  # equivalent bigrams compare equal.
  #
  # Returns an Array of two-character Strings (empty for nil or for strings
  # shorter than two characters after stripping).
  def bigrams(string)
    return [] if string.nil?

    string = string.strip
    return [] if string.length < 2

    string.
      chars.
      each_cons(2).
      map { |c1, c2| "#{c1.mb_chars.downcase}#{c2.mb_chars.downcase}" }.
      map { |bigram| bigram.gsub(/[0-9]/, '0'.freeze) }.
      map { |bigram| bigram.gsub(/[[:space:]]/, ' '.freeze) }
  end

  # The Bayesian probability of a string being keyboard mashing, given the
  # probability of each bigram if drawn either from the legit corpus or from
  # mashing, and an a priori probability of mashing.
  #
  # The probability shouldn't be taken too literally, but it's a useful
  # indicator.
  #
  # Returns 0 when the string yields no bigrams, otherwise a BigDecimal.
  def probability_of_keyboard_mashing(string, apriori_probability_of_mashing: 0.1)
    grams = bigrams(string)

    return 0 if grams.empty?

    # BigDecimal keeps the product of many small probabilities from
    # underflowing. Kernel#BigDecimal is used because BigDecimal.new no longer
    # exists in current versions of the bigdecimal library.
    prob_bigrams_given_mashing = grams.
      map { |bigram| BigDecimal(mashing_probability(bigram).to_s) }.
      inject(:*)

    prob_bigrams_given_corpus = grams.
      map { |bigram| BigDecimal(corpus_probability(bigram).to_s) }.
      inject(:*)

    numerator = prob_bigrams_given_mashing * apriori_probability_of_mashing

    numerator / (numerator + prob_bigrams_given_corpus * (1 - apriori_probability_of_mashing))
  end

  # Reduce a string to a canonical form for the length and repetition checks:
  # compatibility-decompose it, drop combining marks (so accented letters
  # become their base letter), drop everything that is not alphanumeric, and
  # downcase.
  #
  # String#unicode_normalize is used rather than the multibyte proxy's own
  # `normalize`, which newer ActiveSupport releases no longer provide.
  def normalize_for_comparison(string)
    string.
      unicode_normalize(:nfkd).
      mb_chars.
      gsub(/\p{Mn}+/, ''.freeze).
      gsub(/[^[:alnum:]]+/, ''.freeze).
      downcase
  end

  private

  # Relative frequency of each distinct bigram in the string.
  # Returns a Hash of bigram => Float; empty when there are no bigrams.
  def bigram_frequencies(string)
    grams = bigrams(string)

    grams.
      each_with_object(Hash.new(0)) { |bigram, counts| counts[bigram] += 1 }.
      each_with_object({}) do |(bigram, count), freqs|
        freqs[bigram] = count.to_f / grams.length
      end
  end

  # Cosine similarity between a non-empty bigram frequency Hash and a
  # reference frequency Hash whose vector magnitude is already known.
  # Bigrams missing from the reference contribute zero.
  def cosine_similarity(freqs, reference, reference_magnitude)
    numerator = freqs.map { |bigram, freq| reference[bigram].to_f * freq }.inject(:+)
    denominator = reference_magnitude * ((freqs.values.map { |v| v**2 }.inject(:+)) ** 0.5)

    numerator / denominator
  end

  def missing_vowels?(string, normed)
    # Missing vowels (and doesn't look like acronym, and is ASCII so we can tell)
    return false if normed.chars.any? { |c| c.ord >= 128 }
    return false if string == string.upcase

    normed !~ /[aeiouy]/i
  end

  # One character repeated 5 or more times, or 3 or more times and not an
  # acronym, roman numeral, or www
  def excessive_single_character_repeats?(string, normed)
    return true if normed.chars.uniq.count == 1

    if string =~ /([^[:space:]i])\1\1/i
      return true if normed =~ /([^0-9])\1\1\1\1/i

      string.split(/[[:space:][:punct:]]/).each do |word|
        return true if word =~ /([^iw0-9])\1\1/i && word != word.upcase
      end
    end

    false
  end

  def three_plus_chars_repeat_twice?(string)
    # At least 3 characters repeated at least twice in a row (but only on short
    # strings, otherwise there are false positives)
    string.length < 80 && string =~ /(....*)[[:space:][:punct:]]*\1[[:space:][:punct:]]*\1/
  end

  def asdf_row_and_suspicious?(string)
    # All characters from the same row of the keyboard is suspicious, but we
    # need additional confirmation
    if string.chars.all? { |c| MASH_CHARS.include?(c) }
      return true if string.length >= 16
      return true if string =~ /(...).*\1/ # Three-plus characters, repeated
      return true if string =~ /(..).*\1.*\1/ # Two characters, repeated twice
      return true if string =~ /\b[sdfghjkl]\b/ # Stray lowercase letter
      return true if string =~ /[^aeiouy]{3}/i && (string.length > 5 || string != string.upcase) # Three consonants in a row, non-acronym
    end

    false
  end

  def too_few_alphanumeric_chars?(normed, min_alnum_chars)
    # Too short (unless we're dealing with a large alphabet with legitimate
    # single-char words)
    if normed.length < min_alnum_chars
      unless normed =~ /\p{Han}|\p{Hangul}|\p{Hiragana}|\p{Katakana}/
        return true
      end
    end

    false
  end

  def starts_with_disallowed_punctuation?(string)
    # Starting punctuation, except opening parens or quote
    string =~ /\A[[:punct:]]/ && string !~ /\A(\p{Pi}|\p{Ps}|['"¿»’]).+/
  end

  # More than two words shorter than three characters, making up more than
  # three quarters of all whitespace-separated words.
  def too_many_short_words?(string)
    words = string.split
    short_words = words.count { |w| w.length < 3 }

    short_words > 2 && short_words > 0.75 * words.length
  end

  # Probability of seeing the given bigram in keyboard mashing.
  def mashing_probability(bigram)
    if (f = mashing_bigram_frequencies[bigram])
      f
    elsif bigram =~ /[a-z]{2}/i
      # The bigram itself must be tested here: the looked-up frequency is
      # always nil in this branch, so testing it would never match.
      # 26**2 = 676, so 1 in 2k seems a reasonable probability for an arbitrary two-letter bigram given mashing
      0.0005
    else
      # An arbitrary (non-ASCII) bigram with mashing is slightly more probable than with legit strings
      1e-6
    end
  end

  # Probability of seeing the given bigram in legitimate text.
  def corpus_probability(bigram)
    corpus_bigram_frequencies[bigram] || 1e-7 # Around the smallest frequency we store for the corpus
  end

  # Bigram => frequency table for the "known-good" corpus, loaded once from
  # the YAML resource bundled with the gem and then memoized.
  def corpus_bigram_frequencies
    @corpus_bigram_frequencies ||= YAML.load_file(File.expand_path('../../resources/bigram_frequencies.yml', __FILE__)).freeze
  end

  # Euclidean magnitude of the corpus frequency vector (memoized).
  def corpus_bigram_magnitude
    @corpus_bigram_magnitude ||= (corpus_bigram_frequencies.values.map { |v| v**2 }.inject(:+)) ** 0.5
  end

  def mashing_bigram_frequencies
    # This is a guess because we don't have a good corpus, but we assume that
    # 50% of mashing bigrams are a neighboring pair on the ASDF row or a duplicate
    # and the rest are evenly distributed among other neighboring pairs or char-
    # plus-space. 16 is the number of ASDF-row neighbor pairs (both orders) and
    # 26 the number of doubled letters.
    @mashing_bigram_frequencies ||= MASH_BIGRAMS.each_with_object({}) do |bigram, freqs|
      if bigram[0] == bigram[-1] || bigram.chars.all? { |c| c != ' '.freeze && MASH_CHARS.include?(c) }
        freqs[bigram] = 0.5 / (16 + 26)
      else
        freqs[bigram] = 0.5 / (MASH_BIGRAMS.length - 16 - 26)
      end
    end
  end

  # Euclidean magnitude of the mashing frequency vector (memoized).
  def mashing_bigram_magnitude
    @mashing_bigram_magnitude ||= (mashing_bigram_frequencies.values.map { |v| v**2 }.inject(:+)) ** 0.5
  end
end