data_cleansing 0.8.0 → 0.9.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/data_cleansing/cleaners.rb +49 -16
- data/lib/data_cleansing/version.rb +1 -1
- data/test/cleaners_test.rb +209 -0
- data/test/test_db.sqlite3 +0 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8ba846025b7441eb5a93230b7fbd8ebe2a4d88e3
|
4
|
+
data.tar.gz: 4e209fd6ef57540a8b549d06c314ae4caeddbf59
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7b464ca76d4c40f4621d86a32cd76bd4bc3e71e8b5eed18ac094ae651a8f0be58772a503fa096c6798b081cf3030363973b0d96cfd2cf45d6497e14a5b2717f1
|
7
|
+
data.tar.gz: e6933049c6200cadb6e398e3d2af8bae641534942a201c6ed7b8a47fc991f7a843d7b2d1b6cbc1c00f14d837f2de887ac6011f23c187fe33be3c6199a1e18cdf
|
@@ -45,7 +45,7 @@ module Cleaners
|
|
45
45
|
DataCleansing.register_cleaner(:remove_non_printable, RemoveNonPrintable)
|
46
46
|
|
47
47
|
# Remove HTML Markup
|
48
|
-
module
|
48
|
+
module ReplaceHTMLMarkup
|
49
49
|
HTML_MARKUP = Regexp.compile(/&(amp|quot|gt|lt|apos|nbsp);/in)
|
50
50
|
|
51
51
|
def self.call(string)
|
@@ -53,17 +53,17 @@ module Cleaners
|
|
53
53
|
|
54
54
|
string.gsub!(HTML_MARKUP) do |match|
|
55
55
|
case match.downcase
|
56
|
-
when 'amp' then
|
56
|
+
when '&amp;' then
|
57
57
|
'&'
|
58
|
-
when 'quot' then
|
58
|
+
when '&quot;' then
|
59
59
|
'"'
|
60
|
-
when 'gt' then
|
60
|
+
when '&gt;' then
|
61
61
|
'>'
|
62
|
-
when 'lt' then
|
62
|
+
when '&lt;' then
|
63
63
|
'<'
|
64
|
-
when 'apos' then
|
64
|
+
when '&apos;' then
|
65
65
|
"'"
|
66
|
-
when 'nbsp' then
|
66
|
+
when '&nbsp;' then
|
67
67
|
' '
|
68
68
|
else
|
69
69
|
"&#{match};"
|
@@ -71,16 +71,25 @@ module Cleaners
|
|
71
71
|
end || string
|
72
72
|
end
|
73
73
|
end
|
74
|
-
DataCleansing.register_cleaner(:
|
74
|
+
DataCleansing.register_cleaner(:replace_html_markup, ReplaceHTMLMarkup)
|
75
75
|
|
76
|
-
module
|
76
|
+
module UnescapeURI
|
77
77
|
def self.call(string)
|
78
78
|
return string unless string.is_a?(String)
|
79
79
|
|
80
80
|
URI.unescape(string)
|
81
81
|
end
|
82
82
|
end
|
83
|
-
DataCleansing.register_cleaner(:
|
83
|
+
DataCleansing.register_cleaner(:unescape_uri, UnescapeURI)
|
84
|
+
|
85
|
+
module EscapeURI
|
86
|
+
def self.call(string)
|
87
|
+
return string unless string.is_a?(String)
|
88
|
+
|
89
|
+
URI.escape(string)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
DataCleansing.register_cleaner(:escape_uri, EscapeURI)
|
84
93
|
|
85
94
|
# Compress multiple whitespace to a single space
|
86
95
|
module CompressWhitespace
|
@@ -123,15 +132,39 @@ module Cleaners
|
|
123
132
|
end
|
124
133
|
DataCleansing.register_cleaner(:string_to_integer, StringToInteger)
|
125
134
|
|
135
|
+
# Returns [Float] after removing all non-digit characters, except '.'
|
136
|
+
# Returns nil if no digits are present in the string.
|
137
|
+
module StringToFloat
|
138
|
+
NUMERIC = Regexp.compile(/[^0-9\.]/)
|
139
|
+
|
140
|
+
def self.call(string)
|
141
|
+
return string unless string.is_a?(String)
|
142
|
+
|
143
|
+
# Remove Non-Digit Chars, except for '.'
|
144
|
+
string.gsub!(NUMERIC, '')
|
145
|
+
string.length > 0 ? string.to_f : nil
|
146
|
+
end
|
147
|
+
end
|
148
|
+
DataCleansing.register_cleaner(:string_to_float, StringToFloat)
|
149
|
+
|
126
150
|
# Convert a Date to a Time at the end of day for that date (YYYY-MM-DD 23:59:59)
|
127
151
|
# Ex: 2015-12-31 becomes 2015-12-31 23:59:59
|
128
152
|
# If something other than a String, Date or Time object is passed in, it just passes through.
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
153
|
+
#
|
154
|
+
# Note: Only works if ActiveSupport is also loaded since it defines Time#end_of_day.
|
155
|
+
module EndOfDay
|
156
|
+
def self.call(datetime)
|
157
|
+
case datetime
|
158
|
+
when String
|
159
|
+
Time.parse(datetime).end_of_day
|
160
|
+
when Date
|
161
|
+
datetime.to_time.end_of_day
|
162
|
+
when Time
|
163
|
+
datetime.end_of_day
|
164
|
+
else
|
165
|
+
datetime
|
166
|
+
end
|
134
167
|
end
|
135
168
|
end
|
136
|
-
DataCleansing.register_cleaner(:
|
169
|
+
DataCleansing.register_cleaner(:end_of_day, EndOfDay)
|
137
170
|
end
|
@@ -0,0 +1,209 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
require 'active_support/core_ext/time/calculations'
|
3
|
+
|
4
|
+
class CleanersTest < Minitest::Test
|
5
|
+
class User
|
6
|
+
include DataCleansing::Cleanse
|
7
|
+
|
8
|
+
attr_accessor :first_name, :last_name, :address1, :address2,
|
9
|
+
:make_this_upper, :clean_non_word, :clean_non_printable,
|
10
|
+
:clean_html, :clean_from_uri, :clean_to_uri, :clean_whitespace,
|
11
|
+
:clean_digits_only, :clean_to_integer, :clean_to_float, :clean_end_of_day
|
12
|
+
|
13
|
+
cleanse :first_name, :last_name, :address1, :address2, cleaner: :strip
|
14
|
+
cleanse :make_this_upper, cleaner: :upcase
|
15
|
+
cleanse :clean_non_word, cleaner: :remove_non_word
|
16
|
+
cleanse :clean_non_printable, cleaner: :remove_non_printable
|
17
|
+
cleanse :clean_html, cleaner: :replace_html_markup
|
18
|
+
cleanse :clean_from_uri, cleaner: :unescape_uri
|
19
|
+
cleanse :clean_to_uri, cleaner: :escape_uri
|
20
|
+
cleanse :clean_whitespace, cleaner: :compress_whitespace
|
21
|
+
cleanse :clean_digits_only, cleaner: :digits_only
|
22
|
+
cleanse :clean_to_integer, cleaner: :string_to_integer
|
23
|
+
cleanse :clean_to_float, cleaner: :string_to_float
|
24
|
+
cleanse :clean_end_of_day, cleaner: :end_of_day
|
25
|
+
end
|
26
|
+
|
27
|
+
describe 'Cleaners' do
|
28
|
+
it '#strip' do
|
29
|
+
user = User.new
|
30
|
+
user.first_name = ' jack black '
|
31
|
+
user.last_name = " \n \t joe"
|
32
|
+
user.address1 = "joe \n\n \n \t\t "
|
33
|
+
user.address2 = "joe \n\n bloggs \n \t\t "
|
34
|
+
user.cleanse_attributes!
|
35
|
+
assert_equal 'jack black', user.first_name
|
36
|
+
assert_equal 'joe', user.last_name
|
37
|
+
assert_equal 'joe', user.address1
|
38
|
+
assert_equal "joe \n\n bloggs", user.address2
|
39
|
+
end
|
40
|
+
|
41
|
+
it '#upcase' do
|
42
|
+
user = User.new
|
43
|
+
user.make_this_upper = ' jacK blAck '
|
44
|
+
user.cleanse_attributes!
|
45
|
+
assert_equal ' JACK BLACK ', user.make_this_upper
|
46
|
+
end
|
47
|
+
|
48
|
+
it '#remove_non_word' do
|
49
|
+
user = User.new
|
50
|
+
user.clean_non_word = " !@#$%^&*()+=-~`\t\n jacK blAck <>.,/\"':;{][]\|?/\\ "
|
51
|
+
user.cleanse_attributes!
|
52
|
+
assert_equal 'jacKblAck', user.clean_non_word
|
53
|
+
end
|
54
|
+
|
55
|
+
it '#remove_non_printable' do
|
56
|
+
user = User.new
|
57
|
+
user.clean_non_printable = " !@#$%^&*()+=-~`\t\n jacK blAck <>.,/\"':;{][]\|?/\\ "
|
58
|
+
user.cleanse_attributes!
|
59
|
+
assert_equal " !@#$%^&*()+=-~` jacK blAck <>.,/\"':;{][]\|?/\\ ", user.clean_non_printable
|
60
|
+
end
|
61
|
+
|
62
|
+
describe '#clean_html' do
|
63
|
+
it 'cleans &quot;' do
|
64
|
+
user = User.new
|
65
|
+
user.clean_html = 'O&quot;Leary'
|
66
|
+
user.cleanse_attributes!
|
67
|
+
assert_equal 'O"Leary', user.clean_html
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'cleans &amp;' do
|
71
|
+
user = User.new
|
72
|
+
user.clean_html = 'Jim &amp; Candi'
|
73
|
+
user.cleanse_attributes!
|
74
|
+
assert_equal 'Jim & Candi', user.clean_html
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'cleans &gt;' do
|
78
|
+
user = User.new
|
79
|
+
user.clean_html = '2 &gt; 1'
|
80
|
+
user.cleanse_attributes!
|
81
|
+
assert_equal '2 > 1', user.clean_html
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'cleans &lt;' do
|
85
|
+
user = User.new
|
86
|
+
user.clean_html = '1 &lt; 2'
|
87
|
+
user.cleanse_attributes!
|
88
|
+
assert_equal '1 < 2', user.clean_html
|
89
|
+
end
|
90
|
+
|
91
|
+
it 'cleans &apos;' do
|
92
|
+
user = User.new
|
93
|
+
user.clean_html = '1&apos;2'
|
94
|
+
user.cleanse_attributes!
|
95
|
+
assert_equal "1'2", user.clean_html
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'cleans &nbsp;' do
|
99
|
+
user = User.new
|
100
|
+
user.clean_html = '1&nbsp;2'
|
101
|
+
user.cleanse_attributes!
|
102
|
+
assert_equal "1 2", user.clean_html
|
103
|
+
end
|
104
|
+
|
105
|
+
it 'cleans &amp;' do
|
106
|
+
user = User.new
|
107
|
+
user.clean_html = 'Mutt &amp; Jeff Inc.'
|
108
|
+
user.cleanse_attributes!
|
109
|
+
assert_equal 'Mutt & Jeff Inc.', user.clean_html
|
110
|
+
end
|
111
|
+
|
112
|
+
it 'does not clean &;' do
|
113
|
+
user = User.new
|
114
|
+
user.clean_html = 'Mutt &; Jeff Inc.'
|
115
|
+
user.cleanse_attributes!
|
116
|
+
assert_equal 'Mutt &; Jeff Inc.', user.clean_html
|
117
|
+
end
|
118
|
+
|
119
|
+
it 'does not clean &blah;' do
|
120
|
+
user = User.new
|
121
|
+
user.clean_html = '1&blah;2'
|
122
|
+
user.cleanse_attributes!
|
123
|
+
assert_equal '1&blah;2', user.clean_html
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
describe '#unescape_uri' do
|
128
|
+
it 'converts %20' do
|
129
|
+
user = User.new
|
130
|
+
user.clean_from_uri = 'Jim%20%20Bob%20'
|
131
|
+
user.cleanse_attributes!
|
132
|
+
assert_equal 'Jim Bob ', user.clean_from_uri
|
133
|
+
end
|
134
|
+
it 'converts %20 only' do
|
135
|
+
user = User.new
|
136
|
+
user.clean_from_uri = '%20'
|
137
|
+
user.cleanse_attributes!
|
138
|
+
assert_equal ' ', user.clean_from_uri
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
describe '#escape_uri' do
|
143
|
+
it 'converts %20' do
|
144
|
+
user = User.new
|
145
|
+
user.clean_to_uri = 'Jim Bob '
|
146
|
+
user.cleanse_attributes!
|
147
|
+
assert_equal 'Jim%20%20Bob%20', user.clean_to_uri
|
148
|
+
end
|
149
|
+
it 'converts %20 only' do
|
150
|
+
user = User.new
|
151
|
+
user.clean_to_uri = ' '
|
152
|
+
user.cleanse_attributes!
|
153
|
+
assert_equal '%20', user.clean_to_uri
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
describe '#compress_whitespace' do
|
158
|
+
it 'compresses multiple spaces' do
|
159
|
+
user = User.new
|
160
|
+
user.clean_whitespace = ' J im B ob '
|
161
|
+
user.cleanse_attributes!
|
162
|
+
assert_equal ' J im B ob ', user.clean_whitespace
|
163
|
+
end
|
164
|
+
|
165
|
+
it 'does not compress single spaces' do
|
166
|
+
user = User.new
|
167
|
+
user.clean_whitespace = ' Jack Black'
|
168
|
+
user.cleanse_attributes!
|
169
|
+
assert_equal ' Jack Black', user.clean_whitespace
|
170
|
+
end
|
171
|
+
|
172
|
+
it 'compresses newlines and tabs' do
|
173
|
+
user = User.new
|
174
|
+
user.clean_whitespace = " \n\n J im B ob \t\n\t "
|
175
|
+
user.cleanse_attributes!
|
176
|
+
assert_equal ' J im B ob ', user.clean_whitespace
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
it '#digits_only' do
|
181
|
+
user = User.new
|
182
|
+
user.clean_digits_only = " 1 !@#$%^&*3()+=-~`\t\n jacK6 blAck <>.,/\"':;8{][]9\|?/\\ "
|
183
|
+
user.cleanse_attributes!
|
184
|
+
assert_equal '13689', user.clean_digits_only
|
185
|
+
end
|
186
|
+
|
187
|
+
it '#string_to_integer' do
|
188
|
+
user = User.new
|
189
|
+
user.clean_to_integer = " 1 !@#$%^&*3()+=-~`\t\n jacK6 blAck <>.,/\"':;8{][]9\|?/\\ "
|
190
|
+
user.cleanse_attributes!
|
191
|
+
assert_equal 136, user.clean_to_integer
|
192
|
+
end
|
193
|
+
|
194
|
+
it '#string_to_float' do
|
195
|
+
user = User.new
|
196
|
+
user.clean_to_float = " 1 !@#$%^&*3()+=-~`\t\n jacK6 blAck <>.,/\"':;8{][]9\|?/\\ "
|
197
|
+
user.cleanse_attributes!
|
198
|
+
assert_equal 136.89, user.clean_to_float
|
199
|
+
end
|
200
|
+
|
201
|
+
it '#date_to_time_at_end_of_day' do
|
202
|
+
user = User.new
|
203
|
+
user.clean_end_of_day = Time.parse('2016-03-03 14:33:44 +0000')
|
204
|
+
user.cleanse_attributes!
|
205
|
+
assert_equal Time.parse('2016-03-03 23:59:59 +0000').to_i, user.clean_end_of_day.to_i
|
206
|
+
end
|
207
|
+
|
208
|
+
end
|
209
|
+
end
|
data/test/test_db.sqlite3
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_cleansing
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Reid Morrison
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: concurrent-ruby
|
@@ -55,6 +55,7 @@ files:
|
|
55
55
|
- lib/data_cleansing/railtie.rb
|
56
56
|
- lib/data_cleansing/version.rb
|
57
57
|
- test/active_record_test.rb
|
58
|
+
- test/cleaners_test.rb
|
58
59
|
- test/ruby_test.rb
|
59
60
|
- test/test_db.sqlite3
|
60
61
|
- test/test_helper.rb
|
@@ -84,6 +85,7 @@ specification_version: 4
|
|
84
85
|
summary: Data Cleansing framework for Ruby, Rails, Mongoid and MongoMapper.
|
85
86
|
test_files:
|
86
87
|
- test/active_record_test.rb
|
88
|
+
- test/cleaners_test.rb
|
87
89
|
- test/ruby_test.rb
|
88
90
|
- test/test_db.sqlite3
|
89
91
|
- test/test_helper.rb
|