dejunk 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.gitignore +2 -0
- data/README.md +5 -1
- data/circle.yml +7 -0
- data/lib/dejunk.rb +2 -2
- data/lib/dejunk/version.rb +1 -1
- metadata +4 -4
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: c544a625c931feb25e33682bb9a2c7ad243b57acaab1d1643d8ad410f7cbec69
|
4
|
+
data.tar.gz: dbae0776455be08aa508c2699bdc9c4041d23fc7d627a8fb99a985a05f5c16b7
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 48728d90252fa4b29bb38d165a0920325c1d4bd13a54f70be637fbd36b572df49458446bf4caaaefb5e7289ca1a5ec990b46d917734f10be8e2783edaae7f4ff
|
7
|
+
data.tar.gz: ca738d822697a164101f3e832e94a95cdf98e59180930bb0bd48c467e7999f0ad47da014b13e59ba36ce0d50326f17201eeb1b3a9357ab2469643ae0fd8fc5c4
|
data/.gitignore
CHANGED
data/README.md
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
# Dejunk
|
2
2
|
|
3
|
+
[![CircleCI](https://circleci.com/gh/academia-edu/dejunk.svg?style=svg)](https://circleci.com/gh/academia-edu/dejunk)
|
4
|
+
|
3
5
|
Detect keyboard mashing and other junk in your data.
|
4
6
|
|
5
7
|
For example, if you allow user-entered tags, but want to hide bad ones. Or if
|
@@ -51,7 +53,9 @@ $ Dejunk.is_junk?('Hi', whitelist_regexes: [/\Ahi\z/i])
|
|
51
53
|
|
52
54
|
Returns a reason when junk is detected for aid in debugging. Optional parameters
|
53
55
|
are `min_alnum_chars` (defaults to 3), and `whitelist_strings` and
|
54
|
-
`whitelist_regexes` (both default to none).
|
56
|
+
`whitelist_regexes` (both default to none, but you'll likely want some domain-specific
|
57
|
+
strings here, which you might discover by checking against a sample from your existing
|
58
|
+
corpus).
|
55
59
|
|
56
60
|
## Development
|
57
61
|
|
data/circle.yml
ADDED
data/lib/dejunk.rb
CHANGED
@@ -131,11 +131,11 @@ module Dejunk
|
|
131
131
|
return 0 unless bigrams.present?
|
132
132
|
|
133
133
|
prob_bigrams_given_mashing = bigrams.
|
134
|
-
map { |bigram| BigDecimal.new(mashing_probability(bigram).to_s) }.
|
134
|
+
map { |bigram| BigDecimal(mashing_probability(bigram).to_s) }.
|
135
135
|
inject(&:*)
|
136
136
|
|
137
137
|
prob_bigrams_given_corpus = bigrams.
|
138
|
-
map { |bigram| BigDecimal.new(corpus_probability(bigram).to_s) }.
|
138
|
+
map { |bigram| BigDecimal(corpus_probability(bigram).to_s) }.
|
139
139
|
inject(&:*)
|
140
140
|
|
141
141
|
numerator = prob_bigrams_given_mashing * apriori_probability_of_mashing
|
data/lib/dejunk/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: dejunk
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.0
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- David Judd
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2019-04-09 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -82,6 +82,7 @@ files:
|
|
82
82
|
- Rakefile
|
83
83
|
- bin/console
|
84
84
|
- bin/setup
|
85
|
+
- circle.yml
|
85
86
|
- dejunk.gemspec
|
86
87
|
- lib/dejunk.rb
|
87
88
|
- lib/dejunk/version.rb
|
@@ -104,8 +105,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
104
105
|
- !ruby/object:Gem::Version
|
105
106
|
version: '0'
|
106
107
|
requirements: []
|
107
|
-
|
108
|
-
rubygems_version: 2.4.5.1
|
108
|
+
rubygems_version: 3.0.1
|
109
109
|
signing_key:
|
110
110
|
specification_version: 4
|
111
111
|
summary: Detect keyboard mashing and other junk in your data.
|