normalic 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,131 +1,37 @@
1
- #only handles U.S addresses
2
- require 'constants'
1
+ require File.expand_path('./normalic/uri', File.dirname(__FILE__))
2
+ require File.expand_path('./normalic/phone_number', File.dirname(__FILE__))
3
+ require File.expand_path('./normalic/address', File.dirname(__FILE__))
3
4
 
4
5
  module Normalic
5
- class Address
6
+ private
6
7
 
7
- attr_accessor :number, :direction, :street, :type, :city, :state, :zipcode
8
-
9
- def initialize(fields={})
10
- @number = fields[:number]
11
- @direction = fields[:direction]
12
- @street = fields[:street]
13
- @type = fields[:type]
14
- @city = fields[:city]
15
- @state = fields[:state]
16
- @zipcode = fields[:zipcode]
17
- end
18
-
19
- def self.titlize(str)
20
- if str
21
- str.gsub(/\w+/){|w| w.capitalize}
22
- else
23
- nil
24
- end
25
- end
26
-
27
- def [](field_name)
28
- begin
29
- self.send(field_name.to_s)
30
- rescue NoMethodError => e
31
- nil
32
- end
33
- end
34
-
35
- def []=(field_name, value)
36
- begin
37
- self.send("#{field_name}=", value)
38
- rescue NoMethodError => e
39
- nil
40
- end
8
+ String.class_eval do
9
# Strips a token matching +regex+ from the END of the receiver, in place.
#
# The pattern is wrapped so the token must be preceded by non-word
# characters (or the start of the string) and must run up to the end; the
# token together with that leading separator is removed from the receiver.
#
# Returns the token text (without the separator), or nil when the receiver
# does not end with such a token (the receiver is then left untouched).
def detoken!(regex)
  anchored = Regexp.new('(\W+|\A)(' + regex.source + ')$', regex.options)

  found = match(anchored)
  return nil unless found

  # Capture the token before cutting; group 2 is the caller's pattern.
  token = found[2]
  cut!(anchored)
  token
end
42
14
 
43
- def to_s
44
- #"#{line1},#{" #{city.gsub(/\w+/){|w| w.capitalize}}," if city}#{" #{state.upcase}" if state}#{" " + zipcode if zipcode}".strip
45
- "#{line1}#{", #{city}" if city}#{", #{state}" if state}#{" " + zipcode if zipcode}".strip
46
- #"#{line1}, #{city}, #{state} #{zipcode}"
15
# Strips the LAST standalone token matching +regex+ and everything after it
# from the receiver, in place.
#
# The leading greedy `.*` makes the match settle on the right-most
# occurrence; the token must be delimited by non-word characters or the
# string boundaries. Capture group 1 (separator + token + remainder of the
# string) is what gets removed.
#
# Returns the token text, or nil when nothing matched (receiver untouched).
def detoken_rstrip!(regex)
  anchored = Regexp.new('.*((\W|\A)(' + regex.source + ')(\W.*|\Z))', regex.options)

  found = match(anchored)
  return nil unless found

  # Group 3 is the caller's pattern; read it before the receiver is cut.
  token = found[3]
  cut!(anchored, 1)
  token
end
48
20
 
49
- def line1
50
- #"#{number}#{" " + direction.upcase if direction}#{" " + street.gsub(/\w+/){|w| w.capitalize} if street}#{" " + type.capitalize if type}".strip
51
- "#{number}#{" " + direction if direction}#{" " + street if street}#{" " + type if type}"
21
# Strips a token matching +regex+ from the FRONT of the receiver, in place.
#
# The token must start at the beginning of a line and be followed by
# non-word characters or the end of the string; the token and that trailing
# separator are removed from the receiver.
#
# Returns the token text (without the separator), or nil when the receiver
# does not start with such a token (receiver untouched).
def detoken_front!(regex)
  anchored = Regexp.new('^(' + regex.source + ')(\W+|\Z)', regex.options)

  found = match(anchored)
  return nil unless found

  # Group 1 is the caller's pattern; read it before the receiver is cut.
  token = found[1]
  cut!(anchored)
  token
end
53
26
 
54
- #Iteratively take chunks off of the string.
55
- def self.parse(address)
56
- address.strip!
57
- regex = {
58
- :unit => /(((\#?\w*)?\W*(su?i?te|p\W*[om]\W*b(?:ox)?|dept|department|ro*m|floor|fl|apt|apartment|unit|box))$)|(\W((su?i?te|p\W*[om]\W*b(?:ox)?|dept|department|ro*m|floor|fl|apt|apartment|unit|box)\W*(\#?\w*)?)\W{0,3}$)/i,
59
- :direct => Regexp.new(Directional.keys * '|' + '|' + Directional.values * '\.?|',Regexp::IGNORECASE),
60
- :type => Regexp.new('(' + StreetTypes_list * '|' + ')\\W*?$',Regexp::IGNORECASE),
61
- :number => /\d+-?\d*/,
62
- :fraction => /\d+\/\d+/,
63
- :country => /\W+USA$/,
64
- :zipcode => /\W+(\d{5}|\d{5}-\d{4})$/,
65
- :state => Regexp.new('\W+(' + StateCodes.values * '|' + '|' + StateCodes.keys * '|' + ')$',Regexp::IGNORECASE),
66
- }
67
- regex[:street] = Regexp.new('((' + regex[:direct].source + ')\\W)?\\W*(.*)\\W*(' + regex[:type].source + ')?', Regexp::IGNORECASE)
68
-
69
- #get rid of USA at the end
70
- country_code = address[regex[:country]]
71
- address.gsub!(regex[:country], "")
72
- zipcode = address[regex[:zipcode]]
73
- address.gsub!(regex[:zipcode], "")
74
- zipcode.gsub!(/\W/, "") if zipcode
75
-
76
- state = address[regex[:state]]
77
- address.gsub!(regex[:state], "")
78
- state.gsub!(/(^\W*|\W*$)/, "").downcase! if state
79
- state = StateCodes[state] || state
80
-
81
- if ZipCityMap[zipcode]
82
- regex[:city] = Regexp.new("\\W+" + ZipCityMap[zipcode] + "$", Regexp::IGNORECASE)
83
- regex[:city] = /,.*$/ if !address[regex[:city]]
84
- city = ZipCityMap[zipcode]
27
+ def cut!(regex, match_index=0)
28
+ if match = self.match(regex)
29
+ i1, i2 = match.offset(match_index)
30
+ self[i1...i2] = ''
31
+ match[match_index]
85
32
  else
86
- regex[:city] = /,.*$/
87
- city = address[regex[:city]]
88
- city.gsub!(/(^\W*|\W*$)/, "").downcase! if city
89
- end
90
-
91
- address.gsub!(regex[:city], "")
92
- address.gsub!(regex[:unit], "")
93
- address.gsub!(Regexp.new('\W(' + regex[:direct].source + ')\\W{0,3}$', Regexp::IGNORECASE), "")
94
-
95
- type = address[regex[:type]]
96
- address.gsub!(regex[:type], "")
97
- type.gsub!(/(^\W*|\W*$)/, "").downcase! if type
98
- type = StreetTypes[type] || type if type
99
-
100
- if address =~ /(\Wand\W|\W\&\W)/
101
- #intersections. print as is
102
- address.gsub!(/(\Wand\W|\W\&\W)/, " and ")
103
- arr = ["", address, "", ""]
104
- else
105
- regex[:address] = Regexp.new('^\W*(' + regex[:number].source + '\\W)?\W*(?:' + regex[:fraction].source + '\W*)?' + regex[:street].source, Regexp::IGNORECASE)
106
- arr = regex[:address].match(address).to_a
107
- end
108
-
109
- number = arr[1].strip if arr[1]
110
- if arr[2] && (!arr[4] || arr[4].empty?)
111
- street = arr[2].strip.downcase
112
- else
113
- dir = Directional[arr[2].strip.downcase] || arr[2].strip.downcase if arr[2]
114
- dir.gsub!(/\W/, "") if dir
33
+ nil
115
34
  end
116
- street = arr[4].strip.downcase if arr[4] && !street
117
-
118
- self.new(
119
- {
120
- :number => number,
121
- :direction => dir ? dir.upcase : nil,
122
- :street => titlize(street),
123
- :type => titlize(type),
124
- :city => titlize(city),
125
- :state => state ? state.upcase : nil,
126
- :zipcode => zipcode
127
- }
128
- )
129
35
  end
130
36
  end
131
37
  end
@@ -0,0 +1,236 @@
1
+ require File.expand_path('../constants', File.dirname(__FILE__))
2
+
3
module Normalic
  # only handles U.S. addresses
  #
  # Parsed, normalized street address. Build one with Address.parse(raw) or
  # directly from a hash of fields. For an intersection (two streets joined
  # by "and"/"&") +direction+, +street+ and +type+ are two-element arrays,
  # +number+ is nil and +intersection+ is true.
  #
  # Relies on the String helpers detoken!, detoken_rstrip!, detoken_front!
  # and cut!, and on the constants STATE_CODES, DIRECTIONAL, STREET_TYPES,
  # STREET_TYPES_LIST and ZIP_CITY_MAP, all defined outside this class.
  class Address
    UNIT_TYPE_REGEX = /ap(artmen)?t|box|building|bldg|dep(artmen)?t|fl(oor)?|po( box)?|r(oo)?m|s(ui)?te|un(i)?t/

    # Un-anchored token patterns; the detoken helpers add the anchoring.
    # They are applied to input that Address.clean has already downcased.
    REGEXES = {:country => /usa/,
               :zipcode => /\d{5}(-\d{4})?/,
               :state => Regexp.new(STATE_CODES.values * '|' + '|' +
                                    STATE_CODES.keys * '|'),
               :city => /\w+(\s\w+)*/,
               :unit => Regexp.new('(#\w+)|' +
                                   '(#?\w+\W+(' + UNIT_TYPE_REGEX.source + '))|' +
                                   '((' + UNIT_TYPE_REGEX.source + ')\W+#?\w+)'),
               :directional => Regexp.new(DIRECTIONAL.keys * '|' + '|' +
                                          DIRECTIONAL.values * '|'),
               :type => Regexp.new(STREET_TYPES_LIST * '|'),
               :number => /\d+/,
               :street => /\w+(\s\w+)*/,
               :intersection => /(.+)\W+(and|&)\W+(.+)/}

    attr_accessor :number, :direction, :street, :type, :city, :state, :zipcode, :intersection

    # fields - hash keyed by the accessor names as symbols; missing keys
    #          leave the attribute nil (+intersection+ defaults to false).
    def initialize(fields={})
      @number = fields[:number]
      @direction = fields[:direction]
      @street = fields[:street]
      @type = fields[:type]
      @city = fields[:city]
      @state = fields[:state]
      @zipcode = fields[:zipcode]
      @intersection = fields[:intersection] || false
    end

    # Parses a free-form address: clean -> tokenize -> normalize.
    # +raw+ is converted with to_s and never modified.
    # Returns a new Address.
    def self.parse(raw)
      address = raw.to_s
      clean = self.clean(address)
      tokens = self.tokenize(clean)
      normd = self.normalize(tokens)

      self.new(normd)
    end

    # Hash-style reader: address[:city]. Returns nil for unknown names.
    #
    # Only publicly callable methods are dispatched (respond_to? ignores
    # private ones), so address[:initialize] can no longer reset the object
    # and private Kernel methods are not reachable through this accessor.
    def [](field_name)
      reader = field_name.to_s
      respond_to?(reader) ? send(reader) : nil
    rescue NoMethodError
      nil
    end

    # Hash-style writer: address[:city] = 'Boston'. Unknown names are
    # ignored. Same public-only dispatch as #[].
    def []=(field_name, value)
      writer = "#{field_name}="
      respond_to?(writer) ? send(writer, value) : nil
    rescue NoMethodError
      nil
    end

    # "line1, city, state zipcode", skipping whichever parts are missing.
    def to_s
      parts = [line1, city, state].select { |e| e }
      parts.join(', ') + (zipcode ? ' ' + zipcode : '')
    end

    # Street line: "number direction street type", or for an intersection
    # "<first street> and <second street>".
    def line1
      if intersection
        first = [direction[0], street[0], type[0]].select { |e| e }
        # Index 1 is the second street of the intersection; using index 0
        # here again would print the first street twice.
        second = [direction[1], street[1], type[1]].select { |e| e }
        first.join(' ') + " and " + second.join(' ')
      else
        [number, direction, street, type].select { |e| e }.join(' ')
      end
    end

    # Two addresses are equal when their string renderings are equal.
    def ==(other)
      to_s == other.to_s
    end

    # Looser comparison than ==: zipcode, state, city, street and number
    # must be equal, while type and direction are only compared when both
    # sides have a value for them.
    def match_essential?(other)
      return false unless zipcode == other.zipcode
      return false unless state == other.state
      return false unless city == other.city
      return false unless street == other.street
      return false unless number == other.number
      return false unless !type || !other.type ||
                          type == other.type
      return false unless !direction || !other.direction ||
                          direction == other.direction
      true
    end

    private

    # NOTE: `private` does not apply to `def self.` methods, so the class
    # methods below remain publicly callable; they are internal helpers of
    # Address.parse by intent only.

    # Capitalizes every word of +str+; nil in, nil out.
    def self.titlize(str)
      str ? str.gsub(/\w+/) { |w| w.capitalize } : nil
    end

    # Returns a downcased copy of +address+ with newlines turned into
    # ", ", surrounding whitespace stripped, whitespace runs collapsed to a
    # single space, and periods removed.
    def self.clean(address)
      # dup rather than clone: clone would carry over a frozen flag and the
      # in-place edits below would then raise on frozen input.
      address = address.dup

      address.downcase!
      address.gsub!("\n", ', ')
      address.strip!
      address.gsub!(/\s+/, ' ')
      address.gsub!('.', '')

      address
    end

    # Splits a cleaned address into raw tokens, consuming it from the right
    # (country, zipcode, state, city, unit) and then parsing what is left
    # as a street or an intersection. The argument is not modified.
    #
    # Returns a hash with keys :zipcode, :state, :city, :type, :street,
    # :direction, :number and :intersection.
    def self.tokenize(address)
      address = address.dup

      address.detoken!(REGEXES[:country])
      zipcode = address.detoken!(REGEXES[:zipcode])

      state = address.detoken!(REGEXES[:state])

      # The zipcode token may be ZIP+4; look the city up by the 5-digit
      # prefix, which is the key form the normalize_* methods use.
      zip5 = zipcode ? zipcode[0, 5] : nil
      if zip5 && ZIP_CITY_MAP[zip5] &&
         (zipcity = ZIP_CITY_MAP[zip5][:city])
        city = address.detoken!(Regexp.new(zipcity))
      end
      unless city
        # Fall back to whatever follows the last comma.
        city = address.cut!(Regexp.new('\W*,\W+(' + REGEXES[:city].source +
                                       ')\W*$'))
        city = city.cut!(REGEXES[:city]) if city
      end

      # The unit (apt/suite/...) is dropped, not reported.
      address.detoken_rstrip!(REGEXES[:unit])

      if m = address.match(REGEXES[:intersection])
        intersection = true
        t1, s1, d1 = self.tokenize_street(m[1], false)
        t2, s2, d2 = self.tokenize_street(m[3], false)
        type = [t1, t2]
        street = [s1, s2]
        direction = [d1, d2]
        number = nil
      else
        intersection = false
        type, street, direction, number = self.tokenize_street(address)
      end

      {:zipcode => zipcode,
       :state => state,
       :city => city,
       :type => type,
       :street => street,
       :direction => direction,
       :number => number,
       :intersection => intersection}
    end

    # Tokenizes a single street string (argument not modified).
    #
    # has_number - when true a leading house number is extracted as well.
    #
    # Returns [type, street, direction, number] when has_number is true,
    # otherwise [type, street, direction].
    def self.tokenize_street(address, has_number=true)
      address = address.dup

      number = has_number ? address.detoken_front!(REGEXES[:number]) : nil
      # A directional may lead ("n main st") or trail ("main st n").
      direction = address.detoken_front!(REGEXES[:directional]) ||
                  address.detoken_rstrip!(REGEXES[:directional])
      type = address.detoken_rstrip!(REGEXES[:type])
      street = address.detoken!(REGEXES[:street])

      if has_number
        return type, street, direction, number
      else
        return type, street, direction
      end
    end

    # Returns a copy of +tokens+ with every field normalized. For an
    # intersection the :type, :street and :direction arrays are normalized
    # element by element (into new arrays, leaving the input's arrays
    # untouched).
    def self.normalize(tokens)
      tokens = tokens.clone

      tokens[:zipcode] = self.normalize_zipcode(tokens[:zipcode])
      tokens[:state] = self.normalize_state(tokens[:state], tokens[:zipcode])
      tokens[:city] = self.normalize_city(tokens[:city], tokens[:zipcode])

      if tokens[:intersection]
        tokens[:type] = tokens[:type].collect { |t| self.normalize_type(t) }
        tokens[:street] = tokens[:street].collect { |s| self.normalize_street(s) }
        tokens[:direction] = tokens[:direction].collect { |d| self.normalize_direction(d) }
      else
        tokens[:type] = self.normalize_type(tokens[:type])
        tokens[:street] = self.normalize_street(tokens[:street])
        tokens[:direction] = self.normalize_direction(tokens[:direction])
      end

      tokens
    end

    # Keeps the first five characters of the zipcode (drops a +4 suffix).
    def self.normalize_zipcode(zipcode)
      zipcode ? zipcode[0, 5] : nil
    end

    # Upcased state. A ZIP_CITY_MAP entry for +zipcode+ takes precedence
    # over the parsed +state+; otherwise the parsed value is mapped through
    # STATE_CODES when it has an entry there.
    def self.normalize_state(state, zipcode=nil)
      if zipcode && ZIP_CITY_MAP[zipcode]
        ZIP_CITY_MAP[zipcode][:state].upcase
      elsif state
        (STATE_CODES[state] || state).upcase
      else
        nil
      end
    end

    # Title-cased city. A ZIP_CITY_MAP entry for +zipcode+ takes precedence
    # over the parsed +city+.
    def self.normalize_city(city, zipcode=nil)
      city = ZIP_CITY_MAP[zipcode][:city] if zipcode && ZIP_CITY_MAP[zipcode]
      city ? self.titlize(city) : nil
    end

    # Street type mapped through STREET_TYPES when it has an entry,
    # title-cased and suffixed with a period; nil in, nil out.
    def self.normalize_type(type)
      return nil unless type

      self.titlize(STREET_TYPES[type] || type) + '.'
    end

    # Title-cased street name; nil in, nil out.
    def self.normalize_street(street)
      street ? self.titlize(street) : nil
    end

    # Directional mapped through DIRECTIONAL when it has an entry, then
    # upcased; nil in, nil out.
    def self.normalize_direction(direction)
      return nil unless direction

      (DIRECTIONAL[direction] || direction).upcase
    end
  end
end
@@ -0,0 +1,49 @@
1
module Normalic
  # only handles U.S. phone numbers
  #
  # A ten-digit phone number split into three string fields:
  # npa (first 3 digits), nxx (next 3 digits) and slid (last 4 digits).
  class PhoneNumber
    attr_accessor :npa, :nxx, :slid

    # fields - hash with optional :npa, :nxx and :slid entries.
    def initialize(fields={})
      @npa = fields[:npa]
      @nxx = fields[:nxx]
      @slid = fields[:slid]
    end

    # Parses any value whose to_s contains a phone number.
    #
    # Every non-digit is discarded, then all leading 0s and 1s are dropped
    # (e.g. a "1" or "001" prefix). The first ten remaining digits become
    # the number; extra trailing digits are ignored.
    #
    # Returns a PhoneNumber, or nil when fewer than ten digits remain.
    def self.parse(raw)
      digs = raw.to_s.gsub(/[^\d]/, '').sub(/\A[01]+/, '')
      return nil if digs.length < 10

      self.new(:npa => digs[0, 3],
               :nxx => digs[3, 3],
               :slid => digs[6, 4])
    end

    # "npa nxx slid", space separated.
    def to_s
      "#{npa} #{nxx} #{slid}"
    end

    # Hash-style reader: phone[:npa]. Returns nil for unknown names.
    #
    # Only publicly callable methods are dispatched (respond_to? ignores
    # private ones), so phone[:initialize] can no longer reset the object
    # and private Kernel methods are not reachable through this accessor.
    def [](field_name)
      reader = field_name.to_s
      respond_to?(reader) ? send(reader) : nil
    rescue NoMethodError
      nil
    end

    # Hash-style writer: phone[:npa] = '415'. Unknown names are ignored.
    # Same public-only dispatch as #[].
    def []=(field_name, value)
      writer = "#{field_name}="
      respond_to?(writer) ? send(writer, value) : nil
    rescue NoMethodError
      nil
    end

    # Two numbers are equal when their string renderings are equal.
    def ==(other)
      to_s == other.to_s
    end
  end
end
@@ -0,0 +1,140 @@
1
+ require 'cgi'
2
+
3
module Normalic
  # A URI broken into scheme, user, subdomain, domain, tld, port, path,
  # query_hash and fragment. All parts are strings except query_hash, which
  # is a hash of key => unescaped value (or nil when there was no query).
  #
  # URI.parse relies on String#cut!, which is defined outside this class.
  class URI
    attr_accessor :scheme, :user,
                  :subdomain, :domain, :tld,
                  :port, :path, :query_hash, :fragment

    # fields - hash keyed by the accessor names as symbols.
    def initialize(fields={})
      @scheme = fields[:scheme]
      @user = fields[:user]
      @subdomain = fields[:subdomain]
      @domain = fields[:domain]
      @tld = fields[:tld]
      @port = fields[:port]
      @path = fields[:path]
      @query_hash = fields[:query_hash]
      @fragment = fields[:fragment]
    end

    # Parses +raw+ (converted with to_s, never modified) by cutting pieces
    # off a working copy. A missing scheme defaults to 'http', a missing
    # subdomain to 'www', and the path is always normalized (at least '/').
    #
    # Returns a URI, or nil when no domain and tld can be found.
    def self.parse(raw)
      # dup rather than clone: clone would carry over a frozen flag and the
      # in-place cuts below would then raise on frozen input.
      url = raw.to_s.dup

      # parts before the authority, left-to-right
      scheme = url.cut!(/^\w+:\/\//)
      scheme.cut!(/:\/\/$/) if scheme
      scheme ||= 'http'

      # parts after the authority, right-to-left
      fragment = url.cut!(/#.*$/)
      fragment.cut!(/^#/) if fragment
      query = url.cut!(/\?.*$/)
      query.cut!(/^\?/) if query
      query_hash = query ? self.parse_query(query) : nil
      path = self.normalize_path(url.cut!(/\/.*$/))

      # parse the authority
      user = url.cut!(/^.+@/)
      user.cut!(/@$/) if user
      port = url.cut!(/:\d+$/)
      port.cut!(/^:/) if port
      tld = url.cut!(/\.\w+$/)
      tld.cut!(/^\./) if tld
      domain = url.cut!(/(\.|\A)\w+$/)
      domain.cut!(/^\./) if domain
      # Whatever is left in front of the domain is the subdomain.
      subdomain = url.empty? ? 'www' : url

      return nil unless tld && domain

      self.new(:scheme => scheme,
               :user => user,
               :subdomain => subdomain,
               :domain => domain,
               :tld => tld,
               :port => port,
               :path => path,
               :query_hash => query_hash,
               :fragment => fragment)
    end

    # Reassembles the URI from whichever parts are set. Query values are
    # written as stored (they are not re-escaped).
    def to_s
      scheme_s = scheme ? scheme + '://' : nil
      user_s = user ? user + '@' : nil

      host_s = [subdomain, domain, tld].select { |e| e }.join('.')
      host_s = nil if host_s == ''

      port_s = port ? ':' + port : nil

      query_s = nil
      if query_hash
        pairs = query_hash.to_a.collect { |kv| kv[0].to_s + '=' + kv[1].to_s }
        query_s = '?' + pairs.join('&')
      end

      fragment_s = fragment ? '#' + fragment : nil

      [scheme_s, user_s, host_s, port_s,
       path, query_s, fragment_s].select { |e| e }.join
    end

    # Hash-style reader: uri[:domain]. Returns nil for unknown names.
    #
    # Only publicly callable methods are dispatched (respond_to? ignores
    # private ones), so uri[:initialize] can no longer reset the object
    # and private Kernel methods are not reachable through this accessor.
    def [](field_name)
      reader = field_name.to_s
      respond_to?(reader) ? send(reader) : nil
    rescue NoMethodError
      nil
    end

    # Hash-style writer: uri[:port] = '8080'. Unknown names are ignored.
    # Same public-only dispatch as #[].
    def []=(field_name, value)
      writer = "#{field_name}="
      respond_to?(writer) ? send(writer, value) : nil
    rescue NoMethodError
      nil
    end

    # Two URIs are equal when their string renderings are equal.
    def ==(other)
      to_s == other.to_s
    end

    # Looser comparison than ==: same tld and domain, and the same
    # subdomain where 'www' and a missing subdomain count as equal.
    def match_essential?(other)
      return false unless tld == other.tld
      return false unless domain == other.domain
      return false unless subdomain == other.subdomain ||
                          (subdomain == 'www' && !other.subdomain) ||
                          (!subdomain && other.subdomain == 'www')
      true
    end

    private

    # NOTE: `private` does not apply to `def self.` methods, so the class
    # methods below remain publicly callable; they are internal helpers of
    # URI.parse by intent only.

    # Collapses empty and '.' segments and resolves '..' against the
    # preceding segment ('..' at the root is dropped). nil yields '/'.
    # The result always starts with '/' and has no trailing slash.
    def self.normalize_path(raw)
      kept = []
      raw.to_s.split('/').each do |part|
        if part.empty? || part == '.'
          next
        elsif part == '..'
          kept.pop
        else
          kept << part
        end
      end
      '/' + kept.join('/')
    end

    # Parses "a=1&b=2" (an optional leading '?' is ignored) into a hash.
    # Values are CGI-unescaped, keys are kept as written, and a key without
    # a value maps to ''.
    def self.parse_query(raw)
      query = raw.to_s.sub(/^\?/, '')

      query_hash = {}
      query.split('&').each do |kv|
        # Empty segments (from "a=1&&b=2") would otherwise create a nil key.
        next if kv.empty?

        # Split on the FIRST '=' only, so values that themselves contain
        # '=' (e.g. base64 padding) are kept whole.
        k, v = kv.split('=', 2)
        query_hash[k] = CGI.unescape(v || '')
      end
      query_hash
    end
  end
end