normalic 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -1,131 +1,37 @@
1
- #only handles U.S addresses
2
- require 'constants'
1
+ require File.expand_path('./normalic/uri', File.dirname(__FILE__))
2
+ require File.expand_path('./normalic/phone_number', File.dirname(__FILE__))
3
+ require File.expand_path('./normalic/address', File.dirname(__FILE__))
3
4
 
4
5
  module Normalic
5
- class Address
6
+ private
6
7
 
7
- attr_accessor :number, :direction, :street, :type, :city, :state, :zipcode
8
-
9
- def initialize(fields={})
10
- @number = fields[:number]
11
- @direction = fields[:direction]
12
- @street = fields[:street]
13
- @type = fields[:type]
14
- @city = fields[:city]
15
- @state = fields[:state]
16
- @zipcode = fields[:zipcode]
17
- end
18
-
19
- def self.titlize(str)
20
- if str
21
- str.gsub(/\w+/){|w| w.capitalize}
22
- else
23
- nil
24
- end
25
- end
26
-
27
- def [](field_name)
28
- begin
29
- self.send(field_name.to_s)
30
- rescue NoMethodError => e
31
- nil
32
- end
33
- end
34
-
35
- def []=(field_name, value)
36
- begin
37
- self.send("#{field_name}=", value)
38
- rescue NoMethodError => e
39
- nil
40
- end
8
+ String.class_eval do
9
# Removes the trailing token of the string when it matches +regex+.
#
# The token must sit at the end of the string (or line) and be preceded by
# either the start of the string or a run of non-word characters; that
# leading separator run is removed together with the token.
#
# @param regex [Regexp] pattern for the token; its options are reused
# @return [String, nil] the matched token (without the separator), or nil
#   when nothing matched, in which case the receiver is left unchanged
def detoken!(regex)
  anchored = Regexp.new('(\W+|\A)(' + regex.source + ')$', regex.options)
  found = match(anchored)
  return nil unless found

  # MatchData keeps its own copy of the subject, so the captures stay
  # readable after the receiver is edited; no clone or second match needed.
  first, last = found.offset(0)
  self[first...last] = ''
  found[2]
end
42
14
 
43
- def to_s
44
- #"#{line1},#{" #{city.gsub(/\w+/){|w| w.capitalize}}," if city}#{" #{state.upcase}" if state}#{" " + zipcode if zipcode}".strip
45
- "#{line1}#{", #{city}" if city}#{", #{state}" if state}#{" " + zipcode if zipcode}".strip
46
- #"#{line1}, #{city}, #{state} #{zipcode}"
15
# Finds the last whole-word occurrence of +regex+ and removes it together
# with everything that follows it on that line (and the single separator
# character in front of it, if any).
#
# @param regex [Regexp] pattern for the token; its options are reused
# @return [String, nil] the matched token, or nil when nothing matched, in
#   which case the receiver is left unchanged
def detoken_rstrip!(regex)
  pattern = Regexp.new('.*((\W|\A)(' + regex.source + ')(\W.*|\Z))', regex.options)
  found = match(pattern)
  return nil unless found

  # Group 1 spans separator + token + trailing remainder; group 3 is the
  # token itself. The leading greedy `.*` makes the last occurrence win.
  first, last = found.offset(1)
  self[first...last] = ''
  found[3]
end
48
20
 
49
- def line1
50
- #"#{number}#{" " + direction.upcase if direction}#{" " + street.gsub(/\w+/){|w| w.capitalize} if street}#{" " + type.capitalize if type}".strip
51
- "#{number}#{" " + direction if direction}#{" " + street if street}#{" " + type if type}"
21
# Removes the leading token of the string when it matches +regex+.
#
# The token must start at the beginning of the string (or line) and be
# followed by a run of non-word characters or the end of the string; that
# trailing separator run is removed together with the token.
#
# @param regex [Regexp] pattern for the token; its options are reused
# @return [String, nil] the matched token (without the separator), or nil
#   when nothing matched, in which case the receiver is left unchanged
def detoken_front!(regex)
  pattern = Regexp.new('^(' + regex.source + ')(\W+|\Z)', regex.options)
  found = match(pattern)
  return nil unless found

  # One match is enough: the captures remain readable after the receiver
  # is edited, so no clone or second match is required.
  first, last = found.offset(0)
  self[first...last] = ''
  found[1]
end
53
26
 
54
- #Iteratively take chunks off of the string.
55
- def self.parse(address)
56
- address.strip!
57
- regex = {
58
- :unit => /(((\#?\w*)?\W*(su?i?te|p\W*[om]\W*b(?:ox)?|dept|department|ro*m|floor|fl|apt|apartment|unit|box))$)|(\W((su?i?te|p\W*[om]\W*b(?:ox)?|dept|department|ro*m|floor|fl|apt|apartment|unit|box)\W*(\#?\w*)?)\W{0,3}$)/i,
59
- :direct => Regexp.new(Directional.keys * '|' + '|' + Directional.values * '\.?|',Regexp::IGNORECASE),
60
- :type => Regexp.new('(' + StreetTypes_list * '|' + ')\\W*?$',Regexp::IGNORECASE),
61
- :number => /\d+-?\d*/,
62
- :fraction => /\d+\/\d+/,
63
- :country => /\W+USA$/,
64
- :zipcode => /\W+(\d{5}|\d{5}-\d{4})$/,
65
- :state => Regexp.new('\W+(' + StateCodes.values * '|' + '|' + StateCodes.keys * '|' + ')$',Regexp::IGNORECASE),
66
- }
67
- regex[:street] = Regexp.new('((' + regex[:direct].source + ')\\W)?\\W*(.*)\\W*(' + regex[:type].source + ')?', Regexp::IGNORECASE)
68
-
69
- #get rid of USA at the end
70
- country_code = address[regex[:country]]
71
- address.gsub!(regex[:country], "")
72
- zipcode = address[regex[:zipcode]]
73
- address.gsub!(regex[:zipcode], "")
74
- zipcode.gsub!(/\W/, "") if zipcode
75
-
76
- state = address[regex[:state]]
77
- address.gsub!(regex[:state], "")
78
- state.gsub!(/(^\W*|\W*$)/, "").downcase! if state
79
- state = StateCodes[state] || state
80
-
81
- if ZipCityMap[zipcode]
82
- regex[:city] = Regexp.new("\\W+" + ZipCityMap[zipcode] + "$", Regexp::IGNORECASE)
83
- regex[:city] = /,.*$/ if !address[regex[:city]]
84
- city = ZipCityMap[zipcode]
27
+ def cut!(regex, match_index=0)
28
+ if match = self.match(regex)
29
+ i1, i2 = match.offset(match_index)
30
+ self[i1...i2] = ''
31
+ match[match_index]
85
32
  else
86
- regex[:city] = /,.*$/
87
- city = address[regex[:city]]
88
- city.gsub!(/(^\W*|\W*$)/, "").downcase! if city
89
- end
90
-
91
- address.gsub!(regex[:city], "")
92
- address.gsub!(regex[:unit], "")
93
- address.gsub!(Regexp.new('\W(' + regex[:direct].source + ')\\W{0,3}$', Regexp::IGNORECASE), "")
94
-
95
- type = address[regex[:type]]
96
- address.gsub!(regex[:type], "")
97
- type.gsub!(/(^\W*|\W*$)/, "").downcase! if type
98
- type = StreetTypes[type] || type if type
99
-
100
- if address =~ /(\Wand\W|\W\&\W)/
101
- #intersections. print as is
102
- address.gsub!(/(\Wand\W|\W\&\W)/, " and ")
103
- arr = ["", address, "", ""]
104
- else
105
- regex[:address] = Regexp.new('^\W*(' + regex[:number].source + '\\W)?\W*(?:' + regex[:fraction].source + '\W*)?' + regex[:street].source, Regexp::IGNORECASE)
106
- arr = regex[:address].match(address).to_a
107
- end
108
-
109
- number = arr[1].strip if arr[1]
110
- if arr[2] && (!arr[4] || arr[4].empty?)
111
- street = arr[2].strip.downcase
112
- else
113
- dir = Directional[arr[2].strip.downcase] || arr[2].strip.downcase if arr[2]
114
- dir.gsub!(/\W/, "") if dir
33
+ nil
115
34
  end
116
- street = arr[4].strip.downcase if arr[4] && !street
117
-
118
- self.new(
119
- {
120
- :number => number,
121
- :direction => dir ? dir.upcase : nil,
122
- :street => titlize(street),
123
- :type => titlize(type),
124
- :city => titlize(city),
125
- :state => state ? state.upcase : nil,
126
- :zipcode => zipcode
127
- }
128
- )
129
35
  end
130
36
  end
131
37
  end
@@ -0,0 +1,236 @@
1
+ require File.expand_path('../constants', File.dirname(__FILE__))
2
+
3
module Normalic
  # Parsed, normalized representation of a street address.
  # Only handles U.S. addresses.
  #
  # Parsing relies on the String helpers +detoken!+, +detoken_rstrip!+,
  # +detoken_front!+ and +cut!+, and on the constants STATE_CODES,
  # DIRECTIONAL, STREET_TYPES, STREET_TYPES_LIST and ZIP_CITY_MAP, all of
  # which are defined outside this class.
  class Address
    UNIT_TYPE_REGEX = /ap(artmen)?t|box|building|bldg|dep(artmen)?t|fl(oor)?|po( box)?|r(oo)?m|s(ui)?te|un(i)?t/

    # Token patterns. They are matched against text that has already been
    # lower-cased by Address.clean.
    REGEXES = {:country => /usa/,
               :zipcode => /\d{5}(-\d{4})?/,
               :state => Regexp.new(STATE_CODES.values * '|' + '|' +
                                    STATE_CODES.keys * '|'),
               :city => /\w+(\s\w+)*/,
               :unit => Regexp.new('(#\w+)|' +
                                   '(#?\w+\W+(' + UNIT_TYPE_REGEX.source + '))|' +
                                   '((' + UNIT_TYPE_REGEX.source + ')\W+#?\w+)'),
               :directional => Regexp.new(DIRECTIONAL.keys * '|' + '|' +
                                          DIRECTIONAL.values * '|'),
               :type => Regexp.new(STREET_TYPES_LIST * '|'),
               :number => /\d+/,
               :street => /\w+(\s\w+)*/,
               :intersection => /(.+)\W+(and|&)\W+(.+)/}

    attr_accessor :number, :direction, :street, :type, :city, :state, :zipcode, :intersection

    # @param fields [Hash] values keyed by :number, :direction, :street,
    #   :type, :city, :state, :zipcode and :intersection. For an
    #   intersection, :direction, :street and :type hold two-element arrays
    #   (one entry per street).
    def initialize(fields = {})
      @number = fields[:number]
      @direction = fields[:direction]
      @street = fields[:street]
      @type = fields[:type]
      @city = fields[:city]
      @state = fields[:state]
      @zipcode = fields[:zipcode]
      @intersection = fields[:intersection] || false
    end

    # Parses free-form address text: clean, tokenize, then normalize.
    #
    # @param raw [#to_s] the address text
    # @return [Address]
    def self.parse(raw)
      new(normalize(tokenize(clean(raw.to_s))))
    end

    # Hash-style reader.
    #
    # Only public methods are reachable, so a name such as :exit can no
    # longer trigger a private Kernel method.
    #
    # @param field_name [#to_s]
    # @return [Object, nil] the value, or nil when there is no such public method
    def [](field_name)
      reader = field_name.to_s
      respond_to?(reader) ? send(reader) : nil
    end

    # Hash-style writer; silently ignores names without a public setter.
    #
    # @param field_name [#to_s]
    # @param value [Object]
    def []=(field_name, value)
      writer = "#{field_name}="
      respond_to?(writer) ? send(writer, value) : nil
    end

    # @return [String] "line1, city, state zipcode", omitting missing parts
    def to_s
      parts = [line1, city, state].select { |e| e }
      parts.join(', ') + (zipcode ? ' ' + zipcode : '')
    end

    # @return [String] the street line; for an intersection, both streets
    #   joined with " and "
    def line1
      if intersection
        # Index 0 describes the first street, index 1 the second one.
        first = [direction[0], street[0], type[0]].select { |e| e }
        second = [direction[1], street[1], type[1]].select { |e| e }
        first.join(' ') + " and " + second.join(' ')
      else
        [number, direction, street, type].select { |e| e }.join(' ')
      end
    end

    # Two addresses are equal when their string forms are equal.
    def ==(other)
      to_s == other.to_s
    end

    # Looser comparison: zipcode, state, city, street and number must be
    # equal; type and direction only have to agree when both sides have one.
    #
    # @param other [Address]
    # @return [Boolean]
    def match_essential?(other)
      return false unless zipcode == other.zipcode
      return false unless state == other.state
      return false unless city == other.city
      return false unless street == other.street
      return false unless number == other.number
      return false unless !type || !other.type || type == other.type
      return false unless !direction || !other.direction || direction == other.direction

      true
    end

    # The helpers below are internal, but they stay public class methods:
    # a bare `private` does not apply to `def self.` definitions, and
    # existing callers may rely on their visibility.

    # Capitalizes every word of +str+.
    #
    # @param str [String, nil]
    # @return [String, nil] nil when +str+ is nil/false
    def self.titlize(str)
      str ? str.gsub(/\w+/) { |word| word.capitalize } : nil
    end

    # Lower-cases the text, turns newlines into ", ", trims it, collapses
    # whitespace runs to one space and drops all periods.
    #
    # Built from non-mutating calls so frozen input is accepted.
    #
    # @param address [String]
    # @return [String] a new string
    def self.clean(address)
      address.downcase.gsub("\n", ', ').strip.gsub(/\s+/, ' ').gsub('.', '')
    end

    # Splits cleaned address text into raw tokens, consuming it from the
    # right (country, zipcode, state, city, unit) and then the street part.
    #
    # @param address [String] text as produced by Address.clean
    # @return [Hash] tokens keyed like the constructor's +fields+
    def self.tokenize(address)
      address = address.dup # dup rather than clone: the copy must be mutable

      address.detoken!(REGEXES[:country])
      zipcode = address.detoken!(REGEXES[:zipcode])
      state = address.detoken!(REGEXES[:state])

      # Look the city up by the 5-digit part so ZIP+4 input behaves like
      # plain 5-digit input; the normalize_* helpers use the same 5-digit key.
      zip_entry = zipcode && ZIP_CITY_MAP[zipcode[0, 5]]
      zipcity = zip_entry && zip_entry[:city]
      city = zipcity ? address.detoken!(Regexp.new(zipcity)) : nil

      unless city
        # Fall back to whatever follows the last comma.
        city = address.cut!(Regexp.new('\W*,\W+(' + REGEXES[:city].source +
                                       ')\W*$'))
        city = city.cut!(REGEXES[:city]) if city
      end

      address.detoken_rstrip!(REGEXES[:unit])

      if (m = address.match(REGEXES[:intersection]))
        intersection = true
        t1, s1, d1 = tokenize_street(m[1], false)
        t2, s2, d2 = tokenize_street(m[3], false)
        type = [t1, t2]
        street = [s1, s2]
        direction = [d1, d2]
        number = nil
      else
        intersection = false
        type, street, direction, number = tokenize_street(address)
      end

      {:zipcode => zipcode,
       :state => state,
       :city => city,
       :type => type,
       :street => street,
       :direction => direction,
       :number => number,
       :intersection => intersection}
    end

    # Tokenizes the street portion of an address.
    #
    # @param address [String]
    # @param has_number [Boolean] whether to look for a leading house number
    # @return [Array] [type, street, direction, number] when +has_number+,
    #   otherwise [type, street, direction]
    def self.tokenize_street(address, has_number = true)
      address = address.dup

      number = has_number ? address.detoken_front!(REGEXES[:number]) : nil
      direction = address.detoken_front!(REGEXES[:directional]) ||
                  address.detoken_rstrip!(REGEXES[:directional])
      type = address.detoken_rstrip!(REGEXES[:type])
      street = address.detoken!(REGEXES[:street])

      has_number ? [type, street, direction, number] : [type, street, direction]
    end

    # Normalizes every token. Works on a copy of the hash and builds new
    # arrays for intersections, so the caller's data is not modified.
    #
    # @param tokens [Hash] as returned by Address.tokenize
    # @return [Hash] normalized tokens
    def self.normalize(tokens)
      tokens = tokens.dup

      tokens[:zipcode] = normalize_zipcode(tokens[:zipcode])
      tokens[:state] = normalize_state(tokens[:state], tokens[:zipcode])
      tokens[:city] = normalize_city(tokens[:city], tokens[:zipcode])

      if tokens[:intersection]
        tokens[:type] = tokens[:type].map { |t| normalize_type(t) }
        tokens[:street] = tokens[:street].map { |s| normalize_street(s) }
        tokens[:direction] = tokens[:direction].map { |d| normalize_direction(d) }
      else
        tokens[:type] = normalize_type(tokens[:type])
        tokens[:street] = normalize_street(tokens[:street])
        tokens[:direction] = normalize_direction(tokens[:direction])
      end

      tokens
    end

    # @return [String, nil] the first five characters of +zipcode+
    def self.normalize_zipcode(zipcode)
      zipcode ? zipcode[0, 5] : nil
    end

    # Prefers the state recorded for +zipcode+ in ZIP_CITY_MAP; otherwise
    # maps the parsed state through STATE_CODES. The result is upper-cased.
    #
    # @return [String, nil]
    def self.normalize_state(state, zipcode = nil)
      entry = zipcode ? ZIP_CITY_MAP[zipcode] : nil
      # A map entry without a :state falls back to the parsed state instead
      # of raising on nil.
      resolved = (entry && entry[:state]) || (state && (STATE_CODES[state] || state))
      resolved ? resolved.upcase : nil
    end

    # Uses the city recorded for +zipcode+ in ZIP_CITY_MAP when the zipcode
    # is known; otherwise the parsed city. The result is title-cased.
    #
    # @return [String, nil]
    def self.normalize_city(city, zipcode = nil)
      city = ZIP_CITY_MAP[zipcode][:city] if zipcode && ZIP_CITY_MAP[zipcode]
      city ? titlize(city) : nil
    end

    # Maps the street type through STREET_TYPES, title-cases it and appends
    # a period.
    #
    # @return [String, nil]
    def self.normalize_type(type)
      return nil unless type

      titlize(STREET_TYPES[type] || type) + '.'
    end

    # @return [String, nil] the title-cased street name
    def self.normalize_street(street)
      street ? titlize(street) : nil
    end

    # Maps the directional through DIRECTIONAL and upper-cases it.
    #
    # @return [String, nil]
    def self.normalize_direction(direction)
      return nil unless direction

      (DIRECTIONAL[direction] || direction).upcase
    end
  end
end
@@ -0,0 +1,49 @@
1
module Normalic
  # A phone number split into three parts: npa (first three digits),
  # nxx (next three) and slid (last four).
  # Only handles U.S. phone numbers.
  class PhoneNumber
    attr_accessor :npa, :nxx, :slid

    # @param fields [Hash] values keyed by :npa, :nxx and :slid
    def initialize(fields = {})
      @npa = fields[:npa]
      @nxx = fields[:nxx]
      @slid = fields[:slid]
    end

    # Parses free-form text into a phone number.
    #
    # All non-digits are dropped, then every leading 0 or 1 is stripped.
    # The first ten remaining digits are used; any further digits are ignored.
    #
    # @param raw [#to_s] the phone number text
    # @return [PhoneNumber, nil] nil when fewer than ten digits remain
    def self.parse(raw)
      digits = raw.to_s.gsub(/\D/, '').sub(/\A[01]+/, '')
      return nil if digits.length < 10

      new(:npa => digits[0, 3],
          :nxx => digits[3, 3],
          :slid => digits[6, 4])
    end

    # @return [String] "npa nxx slid"
    def to_s
      "#{npa} #{nxx} #{slid}"
    end

    # Hash-style reader.
    #
    # Only public methods are reachable, so a name such as :exit can no
    # longer trigger a private Kernel method.
    #
    # @param field_name [#to_s]
    # @return [Object, nil] the value, or nil when there is no such public method
    def [](field_name)
      reader = field_name.to_s
      respond_to?(reader) ? send(reader) : nil
    end

    # Hash-style writer; silently ignores names without a public setter.
    #
    # @param field_name [#to_s]
    # @param value [Object]
    def []=(field_name, value)
      writer = "#{field_name}="
      respond_to?(writer) ? send(writer, value) : nil
    end

    # Two phone numbers are equal when their string forms are equal.
    def ==(other)
      to_s == other.to_s
    end
  end
end
@@ -0,0 +1,140 @@
1
+ require 'cgi'
2
+
3
module Normalic
  # A URI split into scheme, user, subdomain, domain, tld, port, path,
  # query hash and fragment.
  #
  # Parsing relies on String#cut!, which is defined outside this class: it
  # removes the first match of a pattern from the receiver and returns the
  # removed text, or nil when nothing matched.
  class URI
    attr_accessor :scheme, :user,
                  :subdomain, :domain, :tld,
                  :port, :path, :query_hash, :fragment

    # @param fields [Hash] values keyed by the attribute names above
    def initialize(fields = {})
      @scheme = fields[:scheme]
      @user = fields[:user]
      @subdomain = fields[:subdomain]
      @domain = fields[:domain]
      @tld = fields[:tld]
      @port = fields[:port]
      @path = fields[:path]
      @query_hash = fields[:query_hash]
      @fragment = fields[:fragment]
    end

    # Parses URL text by cutting components off a working copy.
    #
    # Defaults: scheme "http" and subdomain "www" when absent; the path is
    # always normalized and is "/" at minimum.
    #
    # @param raw [#to_s] the URL text
    # @return [URI, nil] nil when no domain and tld can be found
    def self.parse(raw)
      # dup rather than clone: clone would keep a frozen flag and the
      # in-place cuts below would raise.
      url = raw.to_s.dup

      # parts before the authority, left-to-right
      scheme = url.cut!(/^\w+:\/\//)
      scheme.cut!(/:\/\/$/) if scheme
      scheme ||= 'http'

      # parts after the authority, right-to-left
      fragment = url.cut!(/#.*$/)
      fragment.cut!(/^#/) if fragment
      query = url.cut!(/\?.*$/)
      query.cut!(/^\?/) if query
      query_hash = query ? parse_query(query) : nil
      path = normalize_path(url.cut!(/\/.*$/))

      # parse the authority
      user = url.cut!(/^.+@/)
      user.cut!(/@$/) if user
      port = url.cut!(/:\d+$/)
      port.cut!(/^:/) if port
      tld = url.cut!(/\.\w+$/)
      tld.cut!(/^\./) if tld
      # Hyphens are legal inside host labels, so accept them in the domain.
      domain = url.cut!(/(\.|\A)[\w-]+$/)
      domain.cut!(/^\./) if domain
      subdomain = url.empty? ? 'www' : url

      return nil unless tld && domain

      new(:scheme => scheme,
          :user => user,
          :subdomain => subdomain,
          :domain => domain,
          :tld => tld,
          :port => port,
          :path => path,
          :query_hash => query_hash,
          :fragment => fragment)
    end

    # Reassembles the URI from whichever components are set.
    #
    # NOTE(review): query values are written as stored; parse_query
    # CGI-unescapes them, so values containing reserved characters are not
    # re-escaped here. Confirm whether callers expect that.
    #
    # @return [String]
    def to_s
      scheme_s = scheme ? scheme + '://' : nil
      user_s = user ? user + '@' : nil

      host_s = [subdomain, domain, tld].select { |e| e }.join('.')
      host_s = nil if host_s == ''

      # Interpolation lets an Integer port be used as well as a String.
      port_s = port ? ":#{port}" : nil

      query_s = if query_hash
                  '?' + query_hash.to_a.map { |k, v| k.to_s + '=' + v.to_s }.join('&')
                end

      fragment_s = fragment ? '#' + fragment : nil

      [scheme_s, user_s, host_s, port_s, path, query_s, fragment_s].select { |e| e }.join
    end

    # Hash-style reader.
    #
    # Only public methods are reachable, so a name such as :exit can no
    # longer trigger a private Kernel method.
    #
    # @param field_name [#to_s]
    # @return [Object, nil] the value, or nil when there is no such public method
    def [](field_name)
      reader = field_name.to_s
      respond_to?(reader) ? send(reader) : nil
    end

    # Hash-style writer; silently ignores names without a public setter.
    #
    # @param field_name [#to_s]
    # @param value [Object]
    def []=(field_name, value)
      writer = "#{field_name}="
      respond_to?(writer) ? send(writer, value) : nil
    end

    # Two URIs are equal when their string forms are equal.
    def ==(other)
      to_s == other.to_s
    end

    # Looser comparison: tld and domain must be equal, and the subdomains
    # must be equal or differ only as "www" versus absent.
    #
    # @param other [URI]
    # @return [Boolean]
    def match_essential?(other)
      return false unless tld == other.tld
      return false unless domain == other.domain
      return false unless subdomain == other.subdomain ||
                          (subdomain == 'www' && !other.subdomain) ||
                          (!subdomain && other.subdomain == 'www')

      true
    end

    # The helpers below are internal, but they stay public class methods:
    # a bare `private` does not apply to `def self.` definitions, and
    # existing callers may rely on their visibility.

    # Resolves "." and ".." segments and drops empty segments.
    #
    # @param raw [#to_s] the raw path (nil is treated as empty)
    # @return [String] a path that always starts with "/"
    def self.normalize_path(raw)
      kept = []
      raw.to_s.split('/').each do |segment|
        if segment.empty? || segment == '.'
          next
        elsif segment == '..'
          kept.pop # no-op when already at the root
        else
          kept << segment
        end
      end
      '/' + kept.join('/')
    end

    # Parses a query string (with or without its leading "?") into a hash.
    # Keys are kept as written; values are CGI-unescaped, and a key without
    # a value maps to "".
    #
    # @param raw [#to_s]
    # @return [Hash{String => String}]
    def self.parse_query(raw)
      query_hash = {}
      raw.to_s.sub(/\A\?/, '').split('&').each do |pair|
        next if pair.empty? # "a=1&&b=2" must not create a nil key

        # Limit 2 keeps "=" characters that belong to the value.
        key, value = pair.split('=', 2)
        query_hash[key] = CGI.unescape(value || '')
      end
      query_hash
    end
  end
end