toon 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.ruby-version +1 -0
- data/Gemfile +3 -0
- data/LICENSE +21 -0
- data/README.md +29 -0
- data/lib/toon.rb +139 -0
- data/toon.gemspec +13 -0
- metadata +48 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: 407ce9c69d6811c19ca1e44d7438f1057b8865c136afdd28a5339d701e627424
|
4
|
+
data.tar.gz: 5868197f08fc85775d5089c87224b839013ce0e83c1727a012ab99b343fbb3c2
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1e01d3e4d0acafeffb61417c545d2b26e94ddeee795478129b8e03b1ca3dbb6d460b5c246b12cc1088f226c765752696c724d12123a71c22053018526785ef8b
|
7
|
+
data.tar.gz: e382956bbb6f6e629d3bed01cfb56cabc514222fa1fe7576b2a51989f85587d306a78b37c001ee42dce19d04b7108d5c796d8aa2da3138723c54b22bbd6a2f04
|
data/.ruby-version
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
2.5
|
data/Gemfile
ADDED
data/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
MIT License
|
2
|
+
|
3
|
+
Copyright (c) 2020 Steve Shreeve
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
7
|
+
in the Software without restriction, including without limitation the rights
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
10
|
+
furnished to do so, subject to the following conditions:
|
11
|
+
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
13
|
+
copies or substantial portions of the Software.
|
14
|
+
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21
|
+
SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,29 @@
|
|
1
|
+
# toon
|
2
|
+
|
3
|
+
`toon` is a Ruby gem that makes it easy to clean up and format data.
|
4
|
+
|
5
|
+
## Example
|
6
|
+
|
7
|
+
The following code:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
require 'toon'
|
11
|
+
|
12
|
+
rows = [
|
13
|
+
%w[ Name DOB ],
|
14
|
+
%w[ tom 19710413 ],
|
15
|
+
%w[ mIkE 19690918 ],
|
16
|
+
]
|
17
|
+
|
18
|
+
p toon! rows, <<~""
|
19
|
+
Name tune
|
20
|
+
DOB to_yyyymmdd_ymd
|
21
|
+
```
|
22
|
+
|
23
|
+
will produce:
|
24
|
+
|
25
|
+
```text
|
26
|
+
[["Name", "DOB" ],
|
27
|
+
["Tom" , "04/13/1971"],
|
28
|
+
["Mike", "09/18/1969"]]
|
29
|
+
```
|
data/lib/toon.rb
ADDED
@@ -0,0 +1,139 @@
|
|
1
|
+
# Blank/present helpers added to every object. The `toon` converters in this
# file call `blank?` on strings and regexp captures.
class Object
  # Whether the receiver counts as "blank".
  #
  # * Objects that do not respond to `empty?` are blank only when falsy
  #   (`nil` or `false`).
  # * Objects that respond to `empty?` are blank when empty, or, if they also
  #   respond to `strip`, when nothing is left after stripping.
  #
  # @return [Boolean]
  def blank?
    return !self unless respond_to?(:empty?)

    empty? || (respond_to?(:strip) && strip.empty?)
  end

  # Logical opposite of {#blank?}.
  #
  # @return [Boolean]
  def present?
    !blank?
  end

  # Returns the receiver itself unless it is blank, in which case `default`
  # is returned instead. Also available as `if_blank`.
  #
  # @param default [Object] value to fall back to when the receiver is blank
  # @return [Object]
  def present(default=nil)
    blank? ? default : self
  end
  alias :if_blank :present
end
|
16
|
+
|
17
|
+
# Two-letter abbreviation => full name for the states and territories listed
# in the table below.
#
# The table is parsed by pattern rather than by column spacing: every entry is
# a two-capital-letter code followed by a name made of capitalized words (plus
# the lowercase word "of"). This keeps multi-word names such as "Puerto Rico"
# and "District of Columbia" intact even if the column padding is reduced to a
# single space.
$STATE_MAP ||= begin
  table = <<~STATES
    AK Alaska                LA Louisiana       PA Pennsylvania
    AL Alabama               MA Massachusetts   PR Puerto Rico
    AR Arkansas              MD Maryland        RI Rhode Island
    AS American Samoa        ME Maine           SC South Carolina
    AZ Arizona               MI Michigan        SD South Dakota
    CA California            MN Minnesota       TN Tennessee
    CO Colorado              MO Missouri        TX Texas
    CT Connecticut           MS Mississippi     UT Utah
    DC District of Columbia  MT Montana         VA Virginia
    DE Delaware              NC North Carolina  VI Virgin Islands
    FL Florida               ND North Dakota    VT Vermont
    GA Georgia               NE Nebraska        WA Washington
    GU Guam                  NH New Hampshire   WI Wisconsin
    HI Hawaii                NJ New Jersey      WV West Virginia
    IA Iowa                  NM New Mexico      WY Wyoming
    ID Idaho                 NV Nevada
    IL Illinois              NY New York
    IN Indiana               OH Ohio
    KS Kansas                OK Oklahoma
    KY Kentucky              OR Oregon
  STATES

  entry = /\b([A-Z]{2})[ \t]+((?:[A-Z][a-z]+|of)(?:[ \t]+(?:[A-Z][a-z]+|of))*)/
  table.scan(entry).each_with_object({}) do |(abbr, name), map|
    map[abbr] = name.split.join(' ') # normalize any inner padding to one space
  end
end

# Lookup from either the abbreviation itself or the upper-cased full name to
# the two-letter abbreviation (e.g. "CA" => "CA", "CALIFORNIA" => "CA").
$STATE_ABBREV ||= $STATE_MAP.each_with_object({}) do |(abbr, name), map|
  map[abbr] = map[name.upcase] = abbr
end
|
41
|
+
|
42
|
+
# Clean up or reformat a single value.
#
# When a block is given, `str` is yielded to it and the block's result is
# returned; `func` is ignored. Otherwise a `nil` input is returned as `nil`
# and `func` selects the conversion:
#
# * `nil`               - return `str` unchanged
# * `'to_decimal'`      - parse an optionally signed/`$`-prefixed number with
#                         optional thousands commas; returns it with two
#                         decimals, or `""` when it does not parse
# * `'to_phone'`        - format a 10-digit number as `(AAA) PPP-NNNN`, with
#                         `, ext. N` appended when an extension is present;
#                         `""` for blank input, `nil` when it does not parse
# * `'to_yyyymmdd'`     - normalize YYYYMMDD, MMDDYYYY, M/D/YYYY, YYYY/M/D or
#                         M/D/YY (separators `-`, `/`, `.`) to `YYYYMMDD`;
#                         `""` when nothing matches
# * `'to_yyyymmdd_ymd'` - same parsing, but renders `MM/DD/YYYY` (despite the
#                         name); returns `str` untouched when it does not parse
# * `'tune'`            - heuristic capitalization for names and addresses
# * `'zip'`, `'to_zip'` - the 5-digit part of a ZIP or ZIP+4, else `''`
# * `'state'`           - two-letter abbreviation via `$STATE_ABBREV`, else `''`
# * anything else       - called as a public method on `str` with `*args`;
#                         warns and returns `nil` if `str` does not respond
#
# The whole value must match for the date and zip conversions: the patterns
# are anchored with `\A`/`\z` so a multi-line cell cannot pass on the strength
# of one of its lines.
#
# @param str  [Object, nil] the value to convert (normally a String)
# @param func [String, nil] conversion name or method name
# @param args [Array] extra arguments for the fallback method call
# @param opts [Hash] accepted for interface compatibility; currently unused
# @return [Object, nil]
def toon(str, func=nil, *args, **opts, &code)
  return yield(str) if block_given?
  return if str.nil? #!# TOO CRAZY?

  case func
  when nil then str
  when 'to_decimal'
    prec = 2
    if str[/\A\s*\$?\s*([-+])?\s*\$?\s*([-+])?\s*(\d[,\d]*)?(\.\d*)?\s*\z/]
      sign = "#{$1}#{$2}".squeeze.include?("-") ? "-" : ""
      left = $3.blank? ? "0" : $3.delete(",")
      decs = $4.blank? ? nil : $4
      "%.*f" % [prec, "#{sign}#{left}#{decs}".to_f]
    else
      ""
    end
  when 'to_phone'
    return "" if str.blank?
    num = str.to_s.squeeze(' ').strip
    num, ext = num.split(/\s*(?:ext?\.?|x|#|:|,)\s*/i, 2)
    ext.gsub!(/\D+/,'') if ext
    # drop everything before the first 2-9 (e.g. a leading "1" or "+1"), then keep digits only
    num = num.sub(/\A[^2-9]*/, '').gsub(/\D+/, '')
    if num =~ /\A([2-9][0-8][0-9])([2-9]\d\d)(\d{4})\z/
      num = "(#{$1}) #{$2}-#{$3}"
      num << ", ext. #{ext}" if num && ext
    else
      num = ext = nil
    end
    num
  when 'to_yyyymmdd'
    case str
    when /\A((?:19|20)\d{2})(\d{2})(\d{2})\z/      then "%s%s%s"     % [$1, $2, $3          ] # YYYYMMDD
    when /\A(\d{2})(\d{2})((?:19|20)\d{2})\z/      then "%s%s%s"     % [$3, $1, $2          ] # MMDDYYYY
    when /\A(\d{1,2})([-\/.])(\d{1,2})\2(\d{4})\z/ then "%s%02d%02d" % [$4, $1.to_i, $3.to_i] # M/D/Y
    when /\A(\d{4})([-\/.])(\d{1,2})\2(\d{1,2})\z/ then "%s%02d%02d" % [$1, $3.to_i, $4.to_i] # Y/M/D
    when /\A(\d{1,2})([-\/.])(\d{1,2})\2(\d{2})\z/
      year = $4.to_i
      # two-digit years up to ~5 years ahead of today are 20xx, the rest 19xx
      year += year < (Time.now.year % 100 + 5) ? 2000 : 1900
      "%04d%02d%02d" % [year, $1.to_i, $3.to_i] # M/D/Y
    else ""
    end
  when 'to_yyyymmdd_ymd'
    toon(str, 'to_yyyymmdd') =~ /\A(\d{4})(\d{2})(\d{2})\z/ ? "#{$2}/#{$3}/#{$1}" : str
  when 'tune'
    s = str.downcase.gsub(/\s\s+/, ' ').strip.gsub(/(?<=^| |[\d[:punct:]])([[[:alpha:]]])/i) { $1.upcase } # general case
    s.gsub!(/\b([a-z])\. ?([bcdfghjklmnpqrstvwxyz])\.?(?=\W|$)/i) { "#$1#$2".upcase } # initials (should this be :name only?)
    s.gsub!(/\b([a-z](?:[a-z&&[^aeiouy]]{1,4}))\b/i) { $1.upcase } # uppercase apparent acronyms
    s.gsub!(/\b([djs]r|us|acct|[ai]nn?|apps|ed|erb|esq|grp|in[cj]|of[cf]|st|up)\.?(?=\W|$)/i) { $1.capitalize } # force camel-case
    s.gsub!(/(^|(?<=\d ))?\b(and|at|as|of|the|in|on|or|for|to|by|de l[ao]s?|del?|(el-)|el|las)($)?\b/i) { ($1 || $3 || $4) ? $2.downcase.capitalize : $2.downcase } # prepositions
    s.gsub!(/\b(mc|mac(?=d[ao][a-k,m-z][a-z]|[fgmpw])|[dol]')([a-z])/i) { $1.capitalize + $2.capitalize } # mixed case (Irish)
    s.gsub!(/\b(ahn|an[gh]|al|art[sz]?|ash|e[dnv]|echt|elms|emms|eng|epps|essl|i[mp]|mrs?|ms|ng|ock|o[hm]|ong|orr|orth|ost|ott|oz|sng|tsz|u[br]|ung)\b/i) { $1.capitalize } # if o[:name] # capitalize
    s.gsub!(/(?<=^| |[[:punct:]])(apt?s?|arch|ave?|bldg|blvd|cr?t|co?mn|drv?|elm|end|f[lt]|hts?|ln|old|pkw?y|plc?|prk|pt|r[dm]|spc|s[qt]r?|srt|street|[nesw])\.?(?=\W|$)/i) { $1.capitalize } # if o[:address] # road features
    s.gsub!(/(1st|2nd|3rd|[\d]th|de l[ao]s)\b/i) { $1.downcase } # ordinal numbers
    s.gsub!(/(?<=^|\d |\b[nesw] |\b[ns][ew] )(d?el|las?|los)\b/i) { $1.capitalize } # uppercase (Spanish)
    s.gsub!(/\b(ca|dba|fbo|ihop|mri|ucla|usa|vru|[ns][ew]|i{1,3}v?)\b/i) { $1.upcase } # force uppercase
    s.gsub!(/\b([-@.\w]+\.(?:com|net|io|org))\b/i) { $1.downcase } # domain names, email (a little bastardized...)
    s.gsub!(/# /, '#') # collapse spaces following a number sign
    s.sub!(/[.,#]+$/, '') # nuke any trailing period, comma, or hash signs
    s.sub!(/\bP\.? ?O\.? ?Box/i, 'PO Box') # PO Boxes
    s
  when 'zip', 'to_zip'
    # only allow 5-digit zip codes; the "+4" part is optional as a unit, so a
    # dangling hyphen ("12345-") is rejected
    str =~ /\A(\d{5})(?:-?\d{4})?\z/ ? $1 : ''
  when 'state'
    $STATE_ABBREV[str.upcase] || ''
  else
    if str.respond_to?(func)
      # NOTE(review): func is dispatched as an arbitrary public method name;
      # do not feed rule names from untrusted sources.
      str.public_send(func, *args)
    else
      warn "dude... you gave me the unknown func #{func.inspect}"
      nil
    end
  end
end
|
119
|
+
|
120
|
+
# Apply per-column `toon` conversions to a table, in place.
#
# `rules` is text with one rule per line: a column name, one or more spaces,
# then the conversion name understood by `toon`; an optional `# comment` may
# follow. The first row of `rows` is treated as the header: its cells are
# matched against the rule names to decide which conversion (if any) applies
# to each column position, and the header row itself is never converted.
# Every later row has its cells replaced with the converted value whenever
# the conversion changes them.
#
# Trailing whitespace after a conversion name is ignored, and a rule with no
# conversion name is skipped, so a stray space at the end of a rule line does
# not turn into an unknown conversion.
#
# @param rows  [Array<Array>] header row followed by data rows; mutated
# @param rules [String] rule text as described above
# @return [Array<Array>] the same `rows` object
def toon!(rows, rules)
  todo = rules.scan(/^\s*(.*?) +(.*?)(?:\s*#.*)?$/).each_with_object({}) do |(name, func), map|
    func = func.rstrip
    map[name] = func unless func.empty?
  end

  funcs = nil # conversion name per column index, resolved from the header row
  rows.each do |cols|
    if funcs.nil?
      funcs = cols.map { |name| todo[name] }
      next
    end

    cols.each_with_index do |cell, c|
      func = funcs[c]
      next unless func

      tuned = toon(cell, func)
      cols[c] = tuned if tuned != cell
    end
  end
  rows
end
|
data/toon.gemspec
ADDED
@@ -0,0 +1,13 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
# Gem specification for toon.
Gem::Specification.new do |s|
  s.name        = "toon"
  s.version     = "0.0.1"
  s.author      = "Steve Shreeve"
  s.email       = "steve.shreeve@gmail.com"
  s.summary     = "A Ruby gem that makes it easy to cleanup and format data"
  s.description = "This gem is helpful for ETL or other general data cleaning."
  s.homepage    = "https://github.com/shreeve/toon"
  s.license     = "MIT"

  # lib/toon.rb uses the squiggly heredoc (<<~), which needs Ruby 2.3 or newer.
  s.required_ruby_version = ">= 2.3.0"

  # List the packaged files without shelling out to git, so the gem can also
  # be built from a tarball or any environment where git is unavailable.
  s.files = Dir["lib/**/*.rb"].sort + %w[.ruby-version Gemfile LICENSE README.md toon.gemspec]
end
|
metadata
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: toon
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Steve Shreeve
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2020-09-26 00:00:00.000000000 Z
|
12
|
+
dependencies: []
|
13
|
+
description: This gem is helpful for ETL or other general data cleaning.
|
14
|
+
email: steve.shreeve@gmail.com
|
15
|
+
executables: []
|
16
|
+
extensions: []
|
17
|
+
extra_rdoc_files: []
|
18
|
+
files:
|
19
|
+
- ".ruby-version"
|
20
|
+
- Gemfile
|
21
|
+
- LICENSE
|
22
|
+
- README.md
|
23
|
+
- lib/toon.rb
|
24
|
+
- toon.gemspec
|
25
|
+
homepage: https://github.com/shreeve/toon
|
26
|
+
licenses:
|
27
|
+
- MIT
|
28
|
+
metadata: {}
|
29
|
+
post_install_message:
|
30
|
+
rdoc_options: []
|
31
|
+
require_paths:
|
32
|
+
- lib
|
33
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
34
|
+
requirements:
|
35
|
+
- - ">="
|
36
|
+
- !ruby/object:Gem::Version
|
37
|
+
version: '0'
|
38
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
39
|
+
requirements:
|
40
|
+
- - ">="
|
41
|
+
- !ruby/object:Gem::Version
|
42
|
+
version: '0'
|
43
|
+
requirements: []
|
44
|
+
rubygems_version: 3.1.4
|
45
|
+
signing_key:
|
46
|
+
specification_version: 4
|
47
|
+
summary: A Ruby gem that makes it easy to cleanup and format data
|
48
|
+
test_files: []
|