data_cleansing 0.8.0 → 0.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/data_cleansing/cleaners.rb +49 -16
- data/lib/data_cleansing/version.rb +1 -1
- data/test/cleaners_test.rb +209 -0
- data/test/test_db.sqlite3 +0 -0
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8ba846025b7441eb5a93230b7fbd8ebe2a4d88e3
|
4
|
+
data.tar.gz: 4e209fd6ef57540a8b549d06c314ae4caeddbf59
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 7b464ca76d4c40f4621d86a32cd76bd4bc3e71e8b5eed18ac094ae651a8f0be58772a503fa096c6798b081cf3030363973b0d96cfd2cf45d6497e14a5b2717f1
|
7
|
+
data.tar.gz: e6933049c6200cadb6e398e3d2af8bae641534942a201c6ed7b8a47fc991f7a843d7b2d1b6cbc1c00f14d837f2de887ac6011f23c187fe33be3c6199a1e18cdf
|
@@ -45,7 +45,7 @@ module Cleaners
|
|
45
45
|
DataCleansing.register_cleaner(:remove_non_printable, RemoveNonPrintable)
|
46
46
|
|
47
47
|
# Remove HTML Markup
|
48
|
-
module
|
48
|
+
module ReplaceHTMLMarkup
|
49
49
|
HTML_MARKUP = Regexp.compile(/&(amp|quot|gt|lt|apos|nbsp);/in)
|
50
50
|
|
51
51
|
def self.call(string)
|
@@ -53,17 +53,17 @@ module Cleaners
|
|
53
53
|
|
54
54
|
string.gsub!(HTML_MARKUP) do |match|
|
55
55
|
case match.downcase
|
56
|
-
when 'amp' then
|
56
|
+
when '&amp;' then
|
57
57
|
'&'
|
58
|
-
when 'quot' then
|
58
|
+
when '&quot;' then
|
59
59
|
'"'
|
60
|
-
when 'gt' then
|
60
|
+
when '&gt;' then
|
61
61
|
'>'
|
62
|
-
when 'lt' then
|
62
|
+
when '&lt;' then
|
63
63
|
'<'
|
64
|
-
when 'apos' then
|
64
|
+
when '&apos;' then
|
65
65
|
"'"
|
66
|
-
when 'nbsp' then
|
66
|
+
when '&nbsp;' then
|
67
67
|
' '
|
68
68
|
else
|
69
69
|
"&#{match};"
|
@@ -71,16 +71,25 @@ module Cleaners
|
|
71
71
|
end || string
|
72
72
|
end
|
73
73
|
end
|
74
|
-
DataCleansing.register_cleaner(:
|
74
|
+
DataCleansing.register_cleaner(:replace_html_markup, ReplaceHTMLMarkup)
|
75
75
|
|
76
|
-
module
|
76
|
+
module UnescapeURI
|
77
77
|
def self.call(string)
|
78
78
|
return string unless string.is_a?(String)
|
79
79
|
|
80
80
|
URI.unescape(string)
|
81
81
|
end
|
82
82
|
end
|
83
|
-
DataCleansing.register_cleaner(:
|
83
|
+
DataCleansing.register_cleaner(:unescape_uri, UnescapeURI)
|
84
|
+
|
85
|
+
module EscapeURI
|
86
|
+
def self.call(string)
|
87
|
+
return string unless string.is_a?(String)
|
88
|
+
|
89
|
+
URI.escape(string)
|
90
|
+
end
|
91
|
+
end
|
92
|
+
DataCleansing.register_cleaner(:escape_uri, EscapeURI)
|
84
93
|
|
85
94
|
# Compress multiple whitespace to a single space
|
86
95
|
module CompressWhitespace
|
@@ -123,15 +132,39 @@ module Cleaners
|
|
123
132
|
end
|
124
133
|
DataCleansing.register_cleaner(:string_to_integer, StringToInteger)
|
125
134
|
|
135
|
+
# Returns [Float] after removing all non-digit characters, except '.'
|
136
|
+
# Returns nil if no digits are present in the string.
|
137
|
+
module StringToFloat
|
138
|
+
NUMERIC = Regexp.compile(/[^0-9\.]/)
|
139
|
+
|
140
|
+
def self.call(string)
|
141
|
+
return string unless string.is_a?(String)
|
142
|
+
|
143
|
+
# Remove Non-Digit Chars, except for '.'
|
144
|
+
string.gsub!(NUMERIC, '')
|
145
|
+
string.length > 0 ? string.to_f : nil
|
146
|
+
end
|
147
|
+
end
|
148
|
+
DataCleansing.register_cleaner(:string_to_float, StringToFloat)
|
149
|
+
|
126
150
|
# Convert a Date to a Time at the end of day for that date (YYYY-MM-DD 23:59:59)
|
127
151
|
# Ex: 2015-12-31 becomes 2015-12-31 23:59:59
|
128
152
|
# If something other than a String, Date, or Time object is passed in, it just passes through.
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
153
|
+
#
|
154
|
+
# Note: Only works if ActiveSupport is also loaded since it defines Time#end_of_day.
|
155
|
+
module EndOfDay
|
156
|
+
def self.call(datetime)
|
157
|
+
case datetime
|
158
|
+
when String
|
159
|
+
Time.parse(datetime).end_of_day
|
160
|
+
when Date
|
161
|
+
datetime.to_time.end_of_day
|
162
|
+
when Time
|
163
|
+
datetime.end_of_day
|
164
|
+
else
|
165
|
+
datetime
|
166
|
+
end
|
134
167
|
end
|
135
168
|
end
|
136
|
-
DataCleansing.register_cleaner(:
|
169
|
+
DataCleansing.register_cleaner(:end_of_day, EndOfDay)
|
137
170
|
end
|
@@ -0,0 +1,209 @@
|
|
1
|
+
require_relative 'test_helper'
|
2
|
+
require 'active_support/core_ext/time/calculations'
|
3
|
+
|
4
|
+
class CleanersTest < Minitest::Test
|
5
|
+
class User
|
6
|
+
include DataCleansing::Cleanse
|
7
|
+
|
8
|
+
attr_accessor :first_name, :last_name, :address1, :address2,
|
9
|
+
:make_this_upper, :clean_non_word, :clean_non_printable,
|
10
|
+
:clean_html, :clean_from_uri, :clean_to_uri, :clean_whitespace,
|
11
|
+
:clean_digits_only, :clean_to_integer, :clean_to_float, :clean_end_of_day
|
12
|
+
|
13
|
+
cleanse :first_name, :last_name, :address1, :address2, cleaner: :strip
|
14
|
+
cleanse :make_this_upper, cleaner: :upcase
|
15
|
+
cleanse :clean_non_word, cleaner: :remove_non_word
|
16
|
+
cleanse :clean_non_printable, cleaner: :remove_non_printable
|
17
|
+
cleanse :clean_html, cleaner: :replace_html_markup
|
18
|
+
cleanse :clean_from_uri, cleaner: :unescape_uri
|
19
|
+
cleanse :clean_to_uri, cleaner: :escape_uri
|
20
|
+
cleanse :clean_whitespace, cleaner: :compress_whitespace
|
21
|
+
cleanse :clean_digits_only, cleaner: :digits_only
|
22
|
+
cleanse :clean_to_integer, cleaner: :string_to_integer
|
23
|
+
cleanse :clean_to_float, cleaner: :string_to_float
|
24
|
+
cleanse :clean_end_of_day, cleaner: :end_of_day
|
25
|
+
end
|
26
|
+
|
27
|
+
describe 'Cleaners' do
|
28
|
+
it '#strip' do
|
29
|
+
user = User.new
|
30
|
+
user.first_name = ' jack black '
|
31
|
+
user.last_name = " \n \t joe"
|
32
|
+
user.address1 = "joe \n\n \n \t\t "
|
33
|
+
user.address2 = "joe \n\n bloggs \n \t\t "
|
34
|
+
user.cleanse_attributes!
|
35
|
+
assert_equal 'jack black', user.first_name
|
36
|
+
assert_equal 'joe', user.last_name
|
37
|
+
assert_equal 'joe', user.address1
|
38
|
+
assert_equal "joe \n\n bloggs", user.address2
|
39
|
+
end
|
40
|
+
|
41
|
+
it '#upcase' do
|
42
|
+
user = User.new
|
43
|
+
user.make_this_upper = ' jacK blAck '
|
44
|
+
user.cleanse_attributes!
|
45
|
+
assert_equal ' JACK BLACK ', user.make_this_upper
|
46
|
+
end
|
47
|
+
|
48
|
+
it '#remove_non_word' do
|
49
|
+
user = User.new
|
50
|
+
user.clean_non_word = " !@#$%^&*()+=-~`\t\n jacK blAck <>.,/\"':;{][]\|?/\\ "
|
51
|
+
user.cleanse_attributes!
|
52
|
+
assert_equal 'jacKblAck', user.clean_non_word
|
53
|
+
end
|
54
|
+
|
55
|
+
it '#remove_non_printable' do
|
56
|
+
user = User.new
|
57
|
+
user.clean_non_printable = " !@#$%^&*()+=-~`\t\n jacK blAck <>.,/\"':;{][]\|?/\\ "
|
58
|
+
user.cleanse_attributes!
|
59
|
+
assert_equal " !@#$%^&*()+=-~` jacK blAck <>.,/\"':;{][]\|?/\\ ", user.clean_non_printable
|
60
|
+
end
|
61
|
+
|
62
|
+
describe '#clean_html' do
|
63
|
+
it 'cleans "' do
|
64
|
+
user = User.new
|
65
|
+
user.clean_html = 'O&quot;Leary'
|
66
|
+
user.cleanse_attributes!
|
67
|
+
assert_equal 'O"Leary', user.clean_html
|
68
|
+
end
|
69
|
+
|
70
|
+
it 'cleans &' do
|
71
|
+
user = User.new
|
72
|
+
user.clean_html = 'Jim & Candi'
|
73
|
+
user.cleanse_attributes!
|
74
|
+
assert_equal 'Jim & Candi', user.clean_html
|
75
|
+
end
|
76
|
+
|
77
|
+
it 'cleans >' do
|
78
|
+
user = User.new
|
79
|
+
user.clean_html = '2 &gt; 1'
|
80
|
+
user.cleanse_attributes!
|
81
|
+
assert_equal '2 > 1', user.clean_html
|
82
|
+
end
|
83
|
+
|
84
|
+
it 'cleans <' do
|
85
|
+
user = User.new
|
86
|
+
user.clean_html = '1 &lt; 2'
|
87
|
+
user.cleanse_attributes!
|
88
|
+
assert_equal '1 < 2', user.clean_html
|
89
|
+
end
|
90
|
+
|
91
|
+
it 'cleans &apos;' do
|
92
|
+
user = User.new
|
93
|
+
user.clean_html = '1&apos;2'
|
94
|
+
user.cleanse_attributes!
|
95
|
+
assert_equal "1'2", user.clean_html
|
96
|
+
end
|
97
|
+
|
98
|
+
it 'cleans &nbsp;' do
|
99
|
+
user = User.new
|
100
|
+
user.clean_html = '1&nbsp;2'
|
101
|
+
user.cleanse_attributes!
|
102
|
+
assert_equal "1 2", user.clean_html
|
103
|
+
end
|
104
|
+
|
105
|
+
it 'cleans &' do
|
106
|
+
user = User.new
|
107
|
+
user.clean_html = 'Mutt & Jeff Inc.'
|
108
|
+
user.cleanse_attributes!
|
109
|
+
assert_equal 'Mutt & Jeff Inc.', user.clean_html
|
110
|
+
end
|
111
|
+
|
112
|
+
it 'does not clean &;' do
|
113
|
+
user = User.new
|
114
|
+
user.clean_html = 'Mutt &; Jeff Inc.'
|
115
|
+
user.cleanse_attributes!
|
116
|
+
assert_equal 'Mutt &; Jeff Inc.', user.clean_html
|
117
|
+
end
|
118
|
+
|
119
|
+
it 'does not clean &blah;' do
|
120
|
+
user = User.new
|
121
|
+
user.clean_html = '1&blah;2'
|
122
|
+
user.cleanse_attributes!
|
123
|
+
assert_equal '1&blah;2', user.clean_html
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
describe '#unescape_uri' do
|
128
|
+
it 'converts %20' do
|
129
|
+
user = User.new
|
130
|
+
user.clean_from_uri = 'Jim%20%20Bob%20'
|
131
|
+
user.cleanse_attributes!
|
132
|
+
assert_equal 'Jim Bob ', user.clean_from_uri
|
133
|
+
end
|
134
|
+
it 'converts %20 only' do
|
135
|
+
user = User.new
|
136
|
+
user.clean_from_uri = '%20'
|
137
|
+
user.cleanse_attributes!
|
138
|
+
assert_equal ' ', user.clean_from_uri
|
139
|
+
end
|
140
|
+
end
|
141
|
+
|
142
|
+
describe '#escape_uri' do
|
143
|
+
it 'converts %20' do
|
144
|
+
user = User.new
|
145
|
+
user.clean_to_uri = 'Jim Bob '
|
146
|
+
user.cleanse_attributes!
|
147
|
+
assert_equal 'Jim%20%20Bob%20', user.clean_to_uri
|
148
|
+
end
|
149
|
+
it 'converts %20 only' do
|
150
|
+
user = User.new
|
151
|
+
user.clean_to_uri = ' '
|
152
|
+
user.cleanse_attributes!
|
153
|
+
assert_equal '%20', user.clean_to_uri
|
154
|
+
end
|
155
|
+
end
|
156
|
+
|
157
|
+
describe '#compress_whitespace' do
|
158
|
+
it 'compresses multiple spaces' do
|
159
|
+
user = User.new
|
160
|
+
user.clean_whitespace = ' J im B ob '
|
161
|
+
user.cleanse_attributes!
|
162
|
+
assert_equal ' J im B ob ', user.clean_whitespace
|
163
|
+
end
|
164
|
+
|
165
|
+
it 'does not compress single spaces' do
|
166
|
+
user = User.new
|
167
|
+
user.clean_whitespace = ' Jack Black'
|
168
|
+
user.cleanse_attributes!
|
169
|
+
assert_equal ' Jack Black', user.clean_whitespace
|
170
|
+
end
|
171
|
+
|
172
|
+
it 'compresses newlines and tabs' do
|
173
|
+
user = User.new
|
174
|
+
user.clean_whitespace = " \n\n J im B ob \t\n\t "
|
175
|
+
user.cleanse_attributes!
|
176
|
+
assert_equal ' J im B ob ', user.clean_whitespace
|
177
|
+
end
|
178
|
+
end
|
179
|
+
|
180
|
+
it '#digits_only' do
|
181
|
+
user = User.new
|
182
|
+
user.clean_digits_only = " 1 !@#$%^&*3()+=-~`\t\n jacK6 blAck <>.,/\"':;8{][]9\|?/\\ "
|
183
|
+
user.cleanse_attributes!
|
184
|
+
assert_equal '13689', user.clean_digits_only
|
185
|
+
end
|
186
|
+
|
187
|
+
it '#string_to_integer' do
|
188
|
+
user = User.new
|
189
|
+
user.clean_to_integer = " 1 !@#$%^&*3()+=-~`\t\n jacK6 blAck <>.,/\"':;8{][]9\|?/\\ "
|
190
|
+
user.cleanse_attributes!
|
191
|
+
assert_equal 136, user.clean_to_integer
|
192
|
+
end
|
193
|
+
|
194
|
+
it '#string_to_float' do
|
195
|
+
user = User.new
|
196
|
+
user.clean_to_float = " 1 !@#$%^&*3()+=-~`\t\n jacK6 blAck <>.,/\"':;8{][]9\|?/\\ "
|
197
|
+
user.cleanse_attributes!
|
198
|
+
assert_equal 136.89, user.clean_to_float
|
199
|
+
end
|
200
|
+
|
201
|
+
it '#date_to_time_at_end_of_day' do
|
202
|
+
user = User.new
|
203
|
+
user.clean_end_of_day = Time.parse('2016-03-03 14:33:44 +0000')
|
204
|
+
user.cleanse_attributes!
|
205
|
+
assert_equal Time.parse('2016-03-03 23:59:59 +0000').to_i, user.clean_end_of_day.to_i
|
206
|
+
end
|
207
|
+
|
208
|
+
end
|
209
|
+
end
|
data/test/test_db.sqlite3
CHANGED
Binary file
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: data_cleansing
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.0
|
4
|
+
version: 0.9.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Reid Morrison
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-03-
|
11
|
+
date: 2016-03-03 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: concurrent-ruby
|
@@ -55,6 +55,7 @@ files:
|
|
55
55
|
- lib/data_cleansing/railtie.rb
|
56
56
|
- lib/data_cleansing/version.rb
|
57
57
|
- test/active_record_test.rb
|
58
|
+
- test/cleaners_test.rb
|
58
59
|
- test/ruby_test.rb
|
59
60
|
- test/test_db.sqlite3
|
60
61
|
- test/test_helper.rb
|
@@ -84,6 +85,7 @@ specification_version: 4
|
|
84
85
|
summary: Data Cleansing framework for Ruby, Rails, Mongoid and MongoMapper.
|
85
86
|
test_files:
|
86
87
|
- test/active_record_test.rb
|
88
|
+
- test/cleaners_test.rb
|
87
89
|
- test/ruby_test.rb
|
88
90
|
- test/test_db.sqlite3
|
89
91
|
- test/test_helper.rb
|