toon 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (8) hide show
  1. checksums.yaml +7 -0
  2. data/.ruby-version +1 -0
  3. data/Gemfile +3 -0
  4. data/LICENSE +21 -0
  5. data/README.md +29 -0
  6. data/lib/toon.rb +139 -0
  7. data/toon.gemspec +13 -0
  8. metadata +48 -0
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 407ce9c69d6811c19ca1e44d7438f1057b8865c136afdd28a5339d701e627424
4
+ data.tar.gz: 5868197f08fc85775d5089c87224b839013ce0e83c1727a012ab99b343fbb3c2
5
+ SHA512:
6
+ metadata.gz: 1e01d3e4d0acafeffb61417c545d2b26e94ddeee795478129b8e03b1ca3dbb6d460b5c246b12cc1088f226c765752696c724d12123a71c22053018526785ef8b
7
+ data.tar.gz: e382956bbb6f6e629d3bed01cfb56cabc514222fa1fe7576b2a51989f85587d306a78b37c001ee42dce19d04b7108d5c796d8aa2da3138723c54b22bbd6a2f04
@@ -0,0 +1 @@
1
+ 2.5
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Steve Shreeve
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,29 @@
1
+ # toon
2
+
3
+ `toon` is a Ruby gem that makes it easy to clean up and format data.
4
+
5
+ ## Example
6
+
7
+ The following code:
8
+
9
+ ```ruby
10
+ require 'toon'
11
+
12
+ rows = [
13
+ %w[ Name DOB ],
14
+ %w[ tom 19710413 ],
15
+ %w[ mIkE 19690918 ],
16
+ ]
17
+
18
+ p toon! rows, <<~""
19
+ Name tune
20
+ DOB to_yyyymmdd_ymd
21
+ ```
22
+
23
+ will produce:
24
+
25
+ ```text
26
+ [["Name", "DOB" ],
27
+ ["Tom" , "04/13/1971"],
28
+ ["Mike", "09/18/1969"]]
29
+ ```
@@ -0,0 +1,139 @@
1
# Blank/present helpers added to every object.
class Object
  # Returns true when the receiver counts as "blank":
  # * objects that do not respond to #empty? are blank only when falsy
  #   (nil and false);
  # * objects that respond to #empty? are blank when empty, or when they
  #   also respond to #strip and are empty after stripping.
  def blank?
    return !self unless respond_to?(:empty?)

    empty? || (respond_to?(:strip) && strip.empty?)
  end

  # Returns true when the receiver is not blank.
  def present?
    !blank?
  end

  # Returns the receiver, or +default+ (nil unless given) when the receiver
  # is blank. Also available as #if_blank.
  def present(default = nil)
    blank? ? default : self
  end
  alias if_blank present
end
16
+
17
# Two-letter postal code => full name, for the US states, DC and territories
# listed below.
#
# The table is parsed with a scan rather than by splitting on runs of spaces,
# so it does not depend on the columns being padded: each entry is a
# two-capital-letter code followed by a name that runs until the next
# " XX " code or the end of the line.
$STATE_MAP ||= <<~STATES.scan(/\b([A-Z]{2}) +(.+?)(?= +[A-Z]{2} |$)/).to_h
  AK Alaska                LA Louisiana       PA Pennsylvania
  AL Alabama               MA Massachusetts   PR Puerto Rico
  AR Arkansas              MD Maryland        RI Rhode Island
  AS American Samoa        ME Maine           SC South Carolina
  AZ Arizona               MI Michigan        SD South Dakota
  CA California            MN Minnesota       TN Tennessee
  CO Colorado              MO Missouri        TX Texas
  CT Connecticut           MS Mississippi     UT Utah
  DC District of Columbia  MT Montana         VA Virginia
  DE Delaware              NC North Carolina  VI Virgin Islands
  FL Florida               ND North Dakota    VT Vermont
  GA Georgia               NE Nebraska        WA Washington
  GU Guam                  NH New Hampshire   WI Wisconsin
  HI Hawaii                NJ New Jersey      WV West Virginia
  IA Iowa                  NM New Mexico      WY Wyoming
  ID Idaho                 NV Nevada
  IL Illinois              NY New York
  IN Indiana               OH Ohio
  KS Kansas                OK Oklahoma
  KY Kentucky              OR Oregon
STATES

# Lookup used to normalize a state to its code: both the code itself ("NY")
# and the upper-cased full name ("NEW YORK") map to the code.
$STATE_ABBREV ||= $STATE_MAP.each_with_object({}) do |(code, name), lookup|
  lookup[code] = lookup[name.upcase] = code
end
41
+
42
# Cleans up or reformats a single value.
#
# With a block, +str+ is yielded and the block's value is returned (+func+ is
# not consulted). Without a block, a nil +str+ returns nil; otherwise +func+
# selects the transformation:
#
#   nil               - +str+ unchanged
#   'to_decimal'      - number with optional sign, "$" and commas, formatted
#                       with two decimals ("$1,234.5" => "1234.50"); "" when
#                       +str+ does not look like a number
#   'to_phone'        - "(AAA) EEE-NNNN", plus ", ext. N" when an extension
#                       with digits follows ext/x/#/:/, ; "" when +str+ is
#                       blank, nil when no 10-digit number can be extracted
#   'to_yyyymmdd'     - YYYYMMDD, MMDDYYYY, M/D/YYYY, YYYY/M/D or M/D/YY
#                       (separator -, / or .) => "YYYYMMDD"; "" otherwise
#   'to_yyyymmdd_ymd' - same inputs => "MM/DD/YYYY"; +str+ when unrecognized
#   'tune'            - heuristic re-capitalization of names and addresses
#   'zip', 'to_zip'   - the 5-digit ZIP (an optional +4 part is dropped);
#                       '' otherwise
#   'state'           - code looked up in $STATE_ABBREV by upper-cased +str+;
#                       '' when unknown
#   anything else     - called on +str+ as a public method with +args+ when
#                       +str+ responds to it; otherwise warns and returns nil
#
# +opts+ is accepted for call compatibility but is not used.
def toon(str, func=nil, *args, **opts, &code)
  return yield(str) if block_given?
  return if str.nil? # nil in, nil out, whatever the func

  case func
  when nil then str
  when 'to_decimal'
    prec = 2
    if str[/\A\s*\$?\s*([-+])?\s*\$?\s*([-+])?\s*(\d[,\d]*)?(\.\d*)?\s*\z/]
      # A minus in either sign position makes the value negative.
      sign = "#{$1}#{$2}".include?("-") ? "-" : ""
      # $3 (integer part) and $4 (".digits") are each nil or non-empty.
      left = $3 ? $3.delete(",") : "0"
      "%.*f" % [prec, "#{sign}#{left}#{$4}".to_f]
    else
      ""
    end
  when 'to_phone'
    return "" if str.blank?
    num, ext = str.to_s.squeeze(' ').strip.split(/\s*(?:ext?\.?|x|#|:|,)\s*/i, 2)
    ext = ext.gsub(/\D+/, '') if ext
    # Drop everything before the first 2-9 digit (country code, punctuation),
    # then keep digits only.
    num = num.to_s.sub(/\A[^2-9]*/, '').gsub(/\D+/, '')
    if num =~ /\A([2-9][0-8][0-9])([2-9]\d\d)(\d{4})\z/
      phone = "(#{$1}) #{$2}-#{$3}"
      # Only mention an extension that actually has digits; a bare marker
      # such as a trailing "x" used to produce a dangling ", ext. ".
      if ext && !ext.empty?
        "#{phone}, ext. #{ext}"
      else
        phone
      end
    else
      nil
    end
  when 'to_yyyymmdd'
    case str
    when /^((?:19|20)\d{2})(\d{2})(\d{2})$/      then "%s%s%s"     % [$1, $2,      $3     ] # YYYYMMDD
    when /^(\d{2})(\d{2})((?:19|20)\d{2})$/      then "%s%s%s"     % [$3, $1,      $2     ] # MMDDYYYY
    when /^(\d{1,2})([-\/.])(\d{1,2})\2(\d{4})$/ then "%s%02d%02d" % [$4, $1.to_i, $3.to_i] # M/D/Y
    when /^(\d{4})([-\/.])(\d{1,2})\2(\d{1,2})$/ then "%s%02d%02d" % [$1, $3.to_i, $4.to_i] # Y/M/D
    when /^(\d{1,2})([-\/.])(\d{1,2})\2(\d{2})$/
      # Two-digit year: anything less than five years past the current
      # two-digit year is taken as 20xx, everything else as 19xx.
      year = $4.to_i
      year += year < (Time.now.year % 100 + 5) ? 2000 : 1900
      "%04d%02d%02d" % [year, $1.to_i, $3.to_i] # M/D/Y
    else ""
    end
  when 'to_yyyymmdd_ymd'
    toon(str, 'to_yyyymmdd') =~ /^(\d{4})(\d{2})(\d{2})$/ ? "#{$2}/#{$3}/#{$1}" : str
  when 'tune'
    # Order matters: each pass works on the output of the previous one.
    s = str
    s = s.downcase.gsub(/\s\s+/, ' ').strip.gsub(/(?<=^| |[\d[:punct:]])([[[:alpha:]]])/i) { $1.upcase } # general case
    s.gsub!(/\b([a-z])\. ?([bcdfghjklmnpqrstvwxyz])\.?(?=\W|$)/i) { "#$1#$2".upcase } # initials (should this be :name only?)
    s.gsub!(/\b([a-z](?:[a-z&&[^aeiouy]]{1,4}))\b/i) { $1.upcase } # uppercase apparent acronyms
    s.gsub!(/\b([djs]r|us|acct|[ai]nn?|apps|ed|erb|esq|grp|in[cj]|of[cf]|st|up)\.?(?=\W|$)/i) { $1.capitalize } # force camel-case
    s.gsub!(/(^|(?<=\d ))?\b(and|at|as|of|the|in|on|or|for|to|by|de l[ao]s?|del?|(el-)|el|las)($)?\b/i) { ($1 || $3 || $4) ? $2.downcase.capitalize : $2.downcase } # prepositions
    s.gsub!(/\b(mc|mac(?=d[ao][a-k,m-z][a-z]|[fgmpw])|[dol]')([a-z])/i) { $1.capitalize + $2.capitalize } # mixed case (Irish)
    s.gsub!(/\b(ahn|an[gh]|al|art[sz]?|ash|e[dnv]|echt|elms|emms|eng|epps|essl|i[mp]|mrs?|ms|ng|ock|o[hm]|ong|orr|orth|ost|ott|oz|sng|tsz|u[br]|ung)\b/i) { $1.capitalize } # capitalize
    s.gsub!(/(?<=^| |[[:punct:]])(apt?s?|arch|ave?|bldg|blvd|cr?t|co?mn|drv?|elm|end|f[lt]|hts?|ln|old|pkw?y|plc?|prk|pt|r[dm]|spc|s[qt]r?|srt|street|[nesw])\.?(?=\W|$)/i) { $1.capitalize } # road features
    s.gsub!(/(1st|2nd|3rd|[\d]th|de l[ao]s)\b/i) { $1.downcase } # ordinal numbers
    s.gsub!(/(?<=^|\d |\b[nesw] |\b[ns][ew] )(d?el|las?|los)\b/i) { $1.capitalize } # uppercase (Spanish)
    s.gsub!(/\b(ca|dba|fbo|ihop|mri|ucla|usa|vru|[ns][ew]|i{1,3}v?)\b/i) { $1.upcase } # force uppercase
    s.gsub!(/\b([-@.\w]+\.(?:com|net|io|org))\b/i) { $1.downcase } # domain names, email
    s.gsub!(/# /, '#') # collapse spaces following a number sign
    s.sub!(/[.,#]+$/, '') # nuke any trailing period, comma, or hash signs
    s.sub!(/\bP\.? ?O\.? ?Box/i, 'PO Box') # PO Boxes
    s
  when 'zip', 'to_zip'
    # Five digits, an optional hyphen, then an optional four-digit suffix;
    # only the five-digit part is kept.
    str =~ /^(\d{5})-?(?:\d{4})?$/ ? $1 : ''
  when 'state'
    $STATE_ABBREV[str.upcase] || ''
  else
    if str.respond_to?(func)
      # respond_to? only reports public methods, so public_send is enough.
      str.public_send(func, *args)
    else
      warn "dude... you gave me the unknown func #{func.inspect}"
      nil
    end
  end
end
119
+
120
# Applies per-column cleanup rules to a table, modifying it in place.
#
# rows  - Array of row Arrays. The first row holds the column names and is
#         never changed.
# rules - String with one rule per line: a column name, one or more spaces,
#         then the func to hand to toon for every cell of that column. A
#         trailing "# comment" after the func is ignored.
#
# A cell is only reassigned when toon returns something different from the
# current value. Columns without a rule, and cells beyond the header's width,
# are left alone.
#
# Returns rows (the same object).
def toon!(rows, rules)
  rule_for = rules.scan(/^\s*(.*?) +(.*?)(?:\s*#.*)?$/).to_h
  funcs = nil # per-column funcs, resolved from the header row

  rows.each do |cols|
    if funcs.nil?
      funcs = cols.map { |name| rule_for[name] }
      next
    end

    cols.each_with_index do |cell, c|
      func = funcs[c]
      next unless func

      cleaned = toon(cell, func)
      cols[c] = cleaned if cleaned != cell
    end
  end

  rows
end
@@ -0,0 +1,13 @@
1
+ # encoding: utf-8
2
+
3
# Packaging metadata for the toon gem.
Gem::Specification.new do |s|
  s.name        = "toon"
  s.version     = "0.0.1"
  s.author      = "Steve Shreeve"
  s.email       = "steve.shreeve@gmail.com"
  s.summary     = "A Ruby gem that makes it easy to cleanup and format data"
  s.description = "This gem is helpful for ETL or other general data cleaning."
  s.homepage    = "https://github.com/shreeve/toon"
  s.license     = "MIT"
  # Ship every file tracked by git except .gitignore.
  s.files       = `git ls-files`.split("\n") - %w[.gitignore]
  # lib/toon.rb uses squiggly heredocs (<<~), which need Ruby 2.3 or newer.
  s.required_ruby_version = ">= 2.3"
end
metadata ADDED
@@ -0,0 +1,48 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: toon
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Steve Shreeve
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2020-09-26 00:00:00.000000000 Z
12
+ dependencies: []
13
+ description: This gem is helpful for ETL or other general data cleaning.
14
+ email: steve.shreeve@gmail.com
15
+ executables: []
16
+ extensions: []
17
+ extra_rdoc_files: []
18
+ files:
19
+ - ".ruby-version"
20
+ - Gemfile
21
+ - LICENSE
22
+ - README.md
23
+ - lib/toon.rb
24
+ - toon.gemspec
25
+ homepage: https://github.com/shreeve/toon
26
+ licenses:
27
+ - MIT
28
+ metadata: {}
29
+ post_install_message:
30
+ rdoc_options: []
31
+ require_paths:
32
+ - lib
33
+ required_ruby_version: !ruby/object:Gem::Requirement
34
+ requirements:
35
+ - - ">="
36
+ - !ruby/object:Gem::Version
37
+ version: '0'
38
+ required_rubygems_version: !ruby/object:Gem::Requirement
39
+ requirements:
40
+ - - ">="
41
+ - !ruby/object:Gem::Version
42
+ version: '0'
43
+ requirements: []
44
+ rubygems_version: 3.1.4
45
+ signing_key:
46
+ specification_version: 4
47
+ summary: A Ruby gem that makes it easy to cleanup and format data
48
+ test_files: []