toon 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
Files changed (8) hide show
  1. checksums.yaml +7 -0
  2. data/.ruby-version +1 -0
  3. data/Gemfile +3 -0
  4. data/LICENSE +21 -0
  5. data/README.md +29 -0
  6. data/lib/toon.rb +139 -0
  7. data/toon.gemspec +13 -0
  8. metadata +48 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 407ce9c69d6811c19ca1e44d7438f1057b8865c136afdd28a5339d701e627424
4
+ data.tar.gz: 5868197f08fc85775d5089c87224b839013ce0e83c1727a012ab99b343fbb3c2
5
+ SHA512:
6
+ metadata.gz: 1e01d3e4d0acafeffb61417c545d2b26e94ddeee795478129b8e03b1ca3dbb6d460b5c246b12cc1088f226c765752696c724d12123a71c22053018526785ef8b
7
+ data.tar.gz: e382956bbb6f6e629d3bed01cfb56cabc514222fa1fe7576b2a51989f85587d306a78b37c001ee42dce19d04b7108d5c796d8aa2da3138723c54b22bbd6a2f04
@@ -0,0 +1 @@
1
+ 2.5
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Steve Shreeve
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # toon
2
+
3
+ `toon` is a Ruby gem that makes it easy to clean up and format data.
4
+
5
+ ## Example
6
+
7
+ The following code:
8
+
9
+ ```ruby
10
+ require 'toon'
11
+
12
+ rows = [
13
+ %w[ Name DOB ],
14
+ %w[ tom 19710413 ],
15
+ %w[ mIkE 19690918 ],
16
+ ]
17
+
18
+ p toon! rows, <<~""
19
+ Name tune
20
+ DOB to_yyyymmdd_ymd
21
+ ```
22
+
23
+ will produce:
24
+
25
+ ```text
26
+ [["Name", "DOB" ],
27
+ ["Tom" , "04/13/1971"],
28
+ ["Mike", "09/18/1969"]]
29
+ ```
@@ -0,0 +1,139 @@
1
# Blank/present helpers added to every object.
class Object
  # Reports whether the receiver counts as "blank".
  #
  # An object that does not respond to +empty?+ is blank only when it is
  # falsy (nil or false). An object that responds to +empty?+ is blank when
  # it is empty, or when it responds to +strip+ and is empty after stripping
  # (e.g. a whitespace-only String).
  #
  # @return [Boolean]
  def blank?
    return !self unless respond_to?(:empty?)

    empty? || (respond_to?(:strip) && strip.empty?)
  end

  # Inverse of #blank?.
  #
  # @return [Boolean]
  def present?
    !blank?
  end

  # Returns the receiver unless it is blank, otherwise +default+.
  #
  # @param default [Object] value to return for a blank receiver
  # @return [Object]
  def present(default = nil)
    blank? ? default : self
  end
  alias if_blank present
end
16
+
17
# Two-letter postal code => full name for the states, DC, and territories
# listed in the table below.
#
# The table is laid out in columns. Each entry is an uppercase two-letter code
# followed by its name; a name ends where the next " XX " code begins or at the
# end of the line. Parsing by that shape (instead of splitting on a fixed
# separator) keeps multi-word names such as "District of Columbia" intact no
# matter how many spaces separate the columns.
$STATE_MAP ||= <<~STATES.scan(/\b([A-Z]{2}) +(.+?)(?= +[A-Z]{2} | *$)/).to_h
  AK Alaska                LA Louisiana        PA Pennsylvania
  AL Alabama               MA Massachusetts    PR Puerto Rico
  AR Arkansas              MD Maryland         RI Rhode Island
  AS American Samoa        ME Maine            SC South Carolina
  AZ Arizona               MI Michigan         SD South Dakota
  CA California            MN Minnesota        TN Tennessee
  CO Colorado              MO Missouri         TX Texas
  CT Connecticut           MS Mississippi      UT Utah
  DC District of Columbia  MT Montana          VA Virginia
  DE Delaware              NC North Carolina   VI Virgin Islands
  FL Florida               ND North Dakota     VT Vermont
  GA Georgia               NE Nebraska         WA Washington
  GU Guam                  NH New Hampshire    WI Wisconsin
  HI Hawaii                NJ New Jersey       WV West Virginia
  IA Iowa                  NM New Mexico       WY Wyoming
  ID Idaho                 NV Nevada
  IL Illinois              NY New York
  IN Indiana               OH Ohio
  KS Kansas                OK Oklahoma
  KY Kentucky              OR Oregon
STATES

# Lookup from either a code ("NY") or an upcased full name ("NEW YORK") to the
# two-letter code.
$STATE_ABBREV ||= $STATE_MAP.each_with_object({}) do |(code, name), lookup|
  lookup[code] = lookup[name.upcase] = code
end
41
+
42
# Cleans up a single value with a named formatter.
#
# When a block is given, +str+ is yielded to it and the block's result is
# returned; +func+ is ignored. Otherwise a nil +str+ returns nil, and +func+
# selects the formatter:
#
#   nil                - returns +str+ unchanged
#   'to_decimal'       - parses an optionally signed / "$"-prefixed /
#                        comma-grouped number and formats it with two
#                        decimals; "" when the text does not match
#   'to_phone'         - "(AAA) PPP-NNNN", plus ", ext. N" when extension
#                        digits are present; "" for blank input, nil when the
#                        digits do not form a valid 10-digit number
#   'to_yyyymmdd'      - normalizes several date layouts to "YYYYMMDD";
#                        "" when no layout matches
#   'to_yyyymmdd_ymd'  - runs 'to_yyyymmdd' and renders "MM/DD/YYYY";
#                        returns +str+ unchanged when that fails
#   'tune'             - heuristic capitalization for names and addresses
#   'zip', 'to_zip'    - the 5-digit part of a ZIP / ZIP+4, else ''
#   'state'            - two-letter code looked up in $STATE_ABBREV, else ''
#   anything else      - sent to +str+ (with +args+) if +str+ responds to it;
#                        otherwise a warning is printed and nil is returned
#
# @param str  [Object] value to clean (the built-in formatters expect a String)
# @param func [String, nil] formatter name
# @param args [Array] extra arguments forwarded when +func+ is sent to +str+
# @param opts [Hash] accepted for compatibility; not used by any formatter
# @return [Object, nil] the cleaned value
def toon(str, func=nil, *args, **opts, &code)
  return yield(str) if block_given?
  return if str.nil?

  case func
  when nil then str
  when 'to_decimal'
    prec = 2
    if str[/\A\s*\$?\s*([-+])?\s*\$?\s*([-+])?\s*(\d[,\d]*)?(\.\d*)?\s*\z/]
      # a sign may appear before or after the dollar sign
      sign = "#{$1}#{$2}".squeeze.include?("-") ? "-" : ""
      left = $3.blank? ? "0" : $3.delete(",")
      decs = $4.blank? ? nil : $4
      "%.*f" % [prec, "#{sign}#{left}#{decs}".to_f]
    else
      ""
    end
  when 'to_phone'
    return "" if str.blank?

    num = str.to_s.squeeze(' ').strip
    # everything after the first extension marker is the extension
    num, ext = num.split(/\s*(?:ext?\.?|x|#|:|,)\s*/i, 2)
    ext.gsub!(/\D+/, '') if ext
    # drop any leading junk (including a leading "1"), then all non-digits
    num = num.sub(/\A[^2-9]*/, '').gsub(/\D+/, '')
    if num =~ /\A([2-9][0-8][0-9])([2-9]\d\d)(\d{4})\z/
      num = "(#{$1}) #{$2}-#{$3}"
      # skip the suffix when the marker was present but carried no digits
      num << ", ext. #{ext}" unless ext.nil? || ext.empty?
      num
    end
  when 'to_yyyymmdd'
    case str
    when /^((?:19|20)\d{2})(\d{2})(\d{2})$/      then "%s%s%s"     % [$1, $2, $3          ] # YYYYMMDD
    when /^(\d{2})(\d{2})((?:19|20)\d{2})$/      then "%s%s%s"     % [$3, $1, $2          ] # MMDDYYYY
    when /^(\d{1,2})([-\/.])(\d{1,2})\2(\d{4})$/ then "%s%02d%02d" % [$4, $1.to_i, $3.to_i] # M/D/Y
    when /^(\d{4})([-\/.])(\d{1,2})\2(\d{1,2})$/ then "%s%02d%02d" % [$1, $3.to_i, $4.to_i] # Y/M/D
    when /^(\d{1,2})([-\/.])(\d{1,2})\2(\d{2})$/
      # two-digit year: anything up to five years past the current year is 20xx
      year = $4.to_i
      year += year < (Time.now.year % 100 + 5) ? 2000 : 1900
      "%04d%02d%02d" % [year, $1.to_i, $3.to_i] # M/D/Y
    else ""
    end
  when 'to_yyyymmdd_ymd'
    toon(str, 'to_yyyymmdd') =~ /^(\d{4})(\d{2})(\d{2})$/ ? "#{$2}/#{$3}/#{$1}" : str
  when 'tune'
    # downcase + collapse whitespace yields a fresh string, so the in-place
    # substitutions below never touch the caller's value
    s = str.downcase.gsub(/\s\s+/, ' ').strip.gsub(/(?<=^| |[\d[:punct:]])([[[:alpha:]]])/i) { $1.upcase } # general case
    s.gsub!(/\b([a-z])\. ?([bcdfghjklmnpqrstvwxyz])\.?(?=\W|$)/i) { "#$1#$2".upcase } # initials (should this be :name only?)
    s.gsub!(/\b([a-z](?:[a-z&&[^aeiouy]]{1,4}))\b/i) { $1.upcase } # uppercase apparent acronyms
    s.gsub!(/\b([djs]r|us|acct|[ai]nn?|apps|ed|erb|esq|grp|in[cj]|of[cf]|st|up)\.?(?=\W|$)/i) { $1.capitalize } # force camel-case
    s.gsub!(/(^|(?<=\d ))?\b(and|at|as|of|the|in|on|or|for|to|by|de l[ao]s?|del?|(el-)|el|las)($)?\b/i) { ($1 || $3 || $4) ? $2.downcase.capitalize : $2.downcase } # prepositions
    s.gsub!(/\b(mc|mac(?=d[ao][a-k,m-z][a-z]|[fgmpw])|[dol]')([a-z])/i) { $1.capitalize + $2.capitalize } # mixed case (Irish)
    s.gsub!(/\b(ahn|an[gh]|al|art[sz]?|ash|e[dnv]|echt|elms|emms|eng|epps|essl|i[mp]|mrs?|ms|ng|ock|o[hm]|ong|orr|orth|ost|ott|oz|sng|tsz|u[br]|ung)\b/i) { $1.capitalize } # capitalize
    s.gsub!(/(?<=^| |[[:punct:]])(apt?s?|arch|ave?|bldg|blvd|cr?t|co?mn|drv?|elm|end|f[lt]|hts?|ln|old|pkw?y|plc?|prk|pt|r[dm]|spc|s[qt]r?|srt|street|[nesw])\.?(?=\W|$)/i) { $1.capitalize } # road features
    s.gsub!(/(1st|2nd|3rd|[\d]th|de l[ao]s)\b/i) { $1.downcase } # ordinal numbers
    s.gsub!(/(?<=^|\d |\b[nesw] |\b[ns][ew] )(d?el|las?|los)\b/i) { $1.capitalize } # uppercase (Spanish)
    s.gsub!(/\b(ca|dba|fbo|ihop|mri|ucla|usa|vru|[ns][ew]|i{1,3}v?)\b/i) { $1.upcase } # force uppercase
    s.gsub!(/\b([-@.\w]+\.(?:com|net|io|org))\b/i) { $1.downcase } # domain names, email (a little bastardized...)
    s.gsub!(/# /, '#') # collapse spaces following a number sign
    s.sub!(/[.,#]+$/, '') # nuke any trailing period, comma, or hash signs
    s.sub!(/\bP\.? ?O\.? ?Box/i, 'PO Box') # PO Boxes
    s
  when 'zip', 'to_zip'
    str =~ /^(\d{5})-?\d{4}?$/ ? $1 : '' # only allow 5-digit zip codes
  when 'state'
    $STATE_ABBREV[str.upcase] || ''
  else
    if str.respond_to?(func)
      # respond_to? only reports public methods, so dispatch publicly as well
      str.public_send(func, *args)
    else
      warn "dude... you gave me the unknown func #{func.inspect}"
      nil
    end
  end
end
119
+
120
# Cleans a table in place by applying per-column formatters.
#
# The first row of +rows+ is treated as the header and is never modified.
# +rules+ is text with one rule per line, "<column name> <formatter>", and an
# optional trailing "# comment". The formatter is the last whitespace-free
# token on the line, so a column name may itself contain spaces. Each data
# cell whose column has a rule is passed through toon(cell, formatter) and
# replaced when the result differs.
#
# @param rows  [Array<Array>] header row followed by data rows; mutated
# @param rules [String] rule text as described above
# @return [Array<Array>] the same +rows+ object
def toon!(rows, rules)
  by_name = rules.scan(/^\s*(.*?) +(\S+)(?:\s*#.*)?$/).to_h
  funcs = nil # formatter per column index, resolved from the header row

  rows.each do |cols|
    if funcs.nil?
      funcs = cols.map { |name| by_name[name] }
      next
    end

    cols.each_with_index do |cell, c|
      func = funcs[c]
      next unless func

      cleaned = toon(cell, func)
      cols[c] = cleaned if cleaned != cell
    end
  end

  rows
end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+
3
+ Gem::Specification.new do |s|
4
+ s.name = "toon"
5
+ s.version = "0.0.1"
6
+ s.author = "Steve Shreeve"
7
+ s.email = "steve.shreeve@gmail.com"
8
+ s.summary = "A Ruby gem that makes it easy to cleanup and format data"
9
+ s.description = "This gem is helpful for ETL or other general data cleaning."
10
+ s.homepage = "https://github.com/shreeve/toon"
11
+ s.license = "MIT"
12
+ s.files = `git ls-files`.split("\n") - %w[.gitignore]
13
+ end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: toon
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Steve Shreeve
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-09-26 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: This gem is helpful for ETL or other general data cleaning.
14
+ email: steve.shreeve@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - ".ruby-version"
20
+ - Gemfile
21
+ - LICENSE
22
+ - README.md
23
+ - lib/toon.rb
24
+ - toon.gemspec
25
+ homepage: https://github.com/shreeve/toon
26
+ licenses:
27
+ - MIT
28
+ metadata: {}
29
+ post_install_message:
30
+ rdoc_options: []
31
+ require_paths:
32
+ - lib
33
+ required_ruby_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ required_rubygems_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ requirements: []
44
+ rubygems_version: 3.1.4
45
+ signing_key:
46
+ specification_version: 4
47
+ summary: A Ruby gem that makes it easy to cleanup and format data
48
+ test_files: []