RubyGems - normalic - Versions diffs - 0.1.0 → 0.1.1 - Mend

normalic 0.1.0 → 0.1.1

Files changed (11) hide show

data/Manifest +4 -1
data/README.rdoc +20 -4
data/Rakefile +1 -1
data/lib/constants.rb +41950 -41953
data/lib/normalic.rb +23 -117
data/lib/normalic/address.rb +236 -0
data/lib/normalic/phone_number.rb +49 -0
data/lib/normalic/uri.rb +140 -0
data/normalic.gemspec +14 -14
data/spec/normalic_spec.rb +271 -29
metadata +11 -5

data/lib/normalic.rb CHANGED

@@ -1,131 +1,37 @@
-#only handles U.S addresses
-require 'constants'
+require File.expand_path('./normalic/uri', File.dirname(__FILE__))
+require File.expand_path('./normalic/phone_number', File.dirname(__FILE__))
+require File.expand_path('./normalic/address', File.dirname(__FILE__))
 module Normalic
-  class Address
+  private
-    attr_accessor :number, :direction, :street, :type, :city, :state, :zipcode
-    def initialize(fields={})
-      @number = fields[:number]
-      @direction = fields[:direction]
-      @street = fields[:street]
-      @type = fields[:type]
-      @city = fields[:city]
-      @state = fields[:state]
-      @zipcode = fields[:zipcode]
-    end
-    def self.titlize(str)
-      if str
-        str.gsub(/\w+/){|w| w.capitalize}
-      else
-        nil
-      end
-    end
-    def [](field_name)
-      begin
-        self.send(field_name.to_s)
-      rescue NoMethodError => e
-        nil
-      end
-    end
-    def []=(field_name, value)
-      begin
-        self.send("#{field_name}=", value)
-      rescue NoMethodError => e
-        nil
-      end
+  String.class_eval do
+    def detoken!(regex)
+      regex_p = Regexp.new('(\W+|\A)(' + regex.source + ')$', regex.options)
+      oldself = self.clone
+      self.cut!(regex_p) ? oldself.match(regex_p)[2] : nil
     end
-    def to_s
-      #"#{line1},#{" #{city.gsub(/\w+/){|w| w.capitalize}}," if city}#{" #{state.upcase}" if state}#{" " + zipcode if zipcode}".strip
-      "#{line1}#{", #{city}" if city}#{", #{state}" if state}#{" " + zipcode if zipcode}".strip
-      #"#{line1}, #{city}, #{state} #{zipcode}"
+    def detoken_rstrip!(regex)
+      regex_p = Regexp.new('.*((\W|\A)(' + regex.source + ')(\W.*|\Z))', regex.options)
+      oldself = self.clone
+      self.cut!(regex_p, 1) ? oldself.match(regex_p)[3] : nil
     end
-    def line1
-      #"#{number}#{" " + direction.upcase if direction}#{" " + street.gsub(/\w+/){|w| w.capitalize} if street}#{" " + type.capitalize if type}".strip
-      "#{number}#{" " + direction if direction}#{" " + street if street}#{" " + type if type}"
+    def detoken_front!(regex)
+      regex_p = Regexp.new('^(' + regex.source + ')(\W+|\Z)', regex.options)
+      oldself = self.clone
+      self.cut!(regex_p) ? oldself.match(regex_p)[1] : nil
     end
-    #Iteratively take chunks off of the string.
-    def self.parse(address)
-      address.strip!
-      regex = {
-        :unit => /(((\#?\w*)?\W*(su?i?te|p\W*[om]\W*b(?:ox)?|dept|department|ro*m|floor|fl|apt|apartment|unit|box))$)|(\W((su?i?te|p\W*[om]\W*b(?:ox)?|dept|department|ro*m|floor|fl|apt|apartment|unit|box)\W*(\#?\w*)?)\W{0,3}$)/i,
-        :direct => Regexp.new(Directional.keys * '|' + '|' + Directional.values * '\.?|',Regexp::IGNORECASE),
-        :type => Regexp.new('(' + StreetTypes_list * '|' + ')\\W*?$',Regexp::IGNORECASE),
-        :number => /\d+-?\d*/,
-        :fraction => /\d+\/\d+/,
-        :country => /\W+USA$/,
-        :zipcode => /\W+(\d{5}|\d{5}-\d{4})$/,
-        :state => Regexp.new('\W+(' + StateCodes.values * '|' + '|' + StateCodes.keys * '|' + ')$',Regexp::IGNORECASE),
-      }
-      regex[:street] = Regexp.new('((' + regex[:direct].source + ')\\W)?\\W*(.*)\\W*(' + regex[:type].source + ')?', Regexp::IGNORECASE)
-      #get rid of USA at the end
-      country_code = address[regex[:country]]
-      address.gsub!(regex[:country], "")
-      zipcode = address[regex[:zipcode]]
-      address.gsub!(regex[:zipcode], "")
-      zipcode.gsub!(/\W/, "") if zipcode
-      state = address[regex[:state]]
-      address.gsub!(regex[:state], "")
-      state.gsub!(/(^\W*|\W*$)/, "").downcase! if state
-      state = StateCodes[state] || state
-      if ZipCityMap[zipcode]
-        regex[:city] = Regexp.new("\\W+" + ZipCityMap[zipcode] + "$", Regexp::IGNORECASE)
-        regex[:city] = /,.*$/ if !address[regex[:city]]
-        city = ZipCityMap[zipcode]
+    def cut!(regex, match_index=0)
+      if match = self.match(regex)
+        i1, i2 = match.offset(match_index)
+        self[i1...i2] = ''
+        match[match_index]
       else
-        regex[:city] = /,.*$/
-        city = address[regex[:city]]
-        city.gsub!(/(^\W*|\W*$)/, "").downcase! if city
-      end
-      address.gsub!(regex[:city], "")
-      address.gsub!(regex[:unit], "")
-      address.gsub!(Regexp.new('\W(' + regex[:direct].source + ')\\W{0,3}$', Regexp::IGNORECASE), "")
-      type = address[regex[:type]]
-      address.gsub!(regex[:type], "")
-      type.gsub!(/(^\W*|\W*$)/, "").downcase! if type
-      type = StreetTypes[type] || type if type
-      if address =~ /(\Wand\W|\W\&\W)/
-        #intersections.  print as is
-        address.gsub!(/(\Wand\W|\W\&\W)/, " and ")
-        arr = ["", address, "", ""]
-      else
-        regex[:address] = Regexp.new('^\W*(' + regex[:number].source + '\\W)?\W*(?:' + regex[:fraction].source + '\W*)?' + regex[:street].source, Regexp::IGNORECASE)
-        arr = regex[:address].match(address).to_a
-      end
-      number = arr[1].strip if arr[1]
-      if arr[2] && (!arr[4] || arr[4].empty?)
-        street = arr[2].strip.downcase
-      else
-        dir = Directional[arr[2].strip.downcase] || arr[2].strip.downcase if arr[2]
-        dir.gsub!(/\W/, "") if dir
+        nil
       end
-      street = arr[4].strip.downcase if arr[4] && !street
-      self.new(
-        {
-          :number => number,
-          :direction => dir ? dir.upcase : nil,
-          :street => titlize(street),
-          :type => titlize(type),
-          :city => titlize(city),
-          :state => state ? state.upcase : nil,
-          :zipcode => zipcode
-        }
-      )
     end
   end
 end

data/lib/normalic/address.rb ADDED

@@ -0,0 +1,236 @@
+require File.expand_path('../constants', File.dirname(__FILE__))
module Normalic
  # A parsed and normalized street address. Only handles U.S. addresses.
  #
  # For an intersection ("1st and Main") the +direction+, +street+ and +type+
  # attributes each hold a two-element array (one entry per street),
  # +number+ is nil and +intersection+ is true. Otherwise they hold a single
  # value (or nil) and +intersection+ is false.
  #
  # Parsing relies on String#detoken!, #detoken_rstrip!, #detoken_front! and
  # #cut!, and on the STATE_CODES, DIRECTIONAL, STREET_TYPES, STREET_TYPES_LIST
  # and ZIP_CITY_MAP constants, all defined outside this class.
  class Address
    UNIT_TYPE_REGEX = /ap(artmen)?t|box|building|bldg|dep(artmen)?t|fl(oor)?|po( box)?|r(oo)?m|s(ui)?te|un(i)?t/

    # Token patterns; the tokenizer wraps them with anchors/separators.
    REGEXES = {:country => /usa/,
               :zipcode => /\d{5}(-\d{4})?/,
               :state => Regexp.new(STATE_CODES.values * '|' + '|' +
                                    STATE_CODES.keys * '|'),
               :city => /\w+(\s\w+)*/,
               :unit => Regexp.new('(#\w+)|' +
                                   '(#?\w+\W+(' + UNIT_TYPE_REGEX.source + '))|' +
                                   '((' + UNIT_TYPE_REGEX.source + ')\W+#?\w+)'),
               :directional => Regexp.new(DIRECTIONAL.keys * '|' + '|' +
                                          DIRECTIONAL.values * '|'),
               :type => Regexp.new(STREET_TYPES_LIST * '|'),
               :number => /\d+/,
               :street => /\w+(\s\w+)*/,
               :intersection => /(.+)\W+(and|&)\W+(.+)/}

    # Attribute names that #[] and #[]= are allowed to touch.
    FIELDS = %w[number direction street type city state zipcode intersection].freeze

    attr_accessor :number, :direction, :street, :type, :city, :state, :zipcode, :intersection

    # fields - Hash keyed by the symbols in FIELDS; missing keys become nil
    #          (+intersection+ becomes false).
    def initialize(fields={})
      @number = fields[:number]
      @direction = fields[:direction]
      @street = fields[:street]
      @type = fields[:type]
      @city = fields[:city]
      @state = fields[:state]
      @zipcode = fields[:zipcode]
      @intersection = fields[:intersection] || false
    end

    # Builds an Address from free-form text: clean, tokenize, normalize.
    # raw is converted with #to_s and is never modified.
    def self.parse(raw)
      cleaned = self.clean(raw.to_s)
      self.new(self.normalize(self.tokenize(cleaned)))
    end

    # Reads an attribute by name (String or Symbol). Returns nil for any
    # name outside FIELDS, so arbitrary methods can never be invoked.
    def [](field_name)
      name = field_name.to_s
      FIELDS.include?(name) ? send(name) : nil
    end

    # Writes an attribute by name; names outside FIELDS are ignored.
    def []=(field_name, value)
      name = field_name.to_s
      FIELDS.include?(name) ? send("#{name}=", value) : nil
    end

    # "line1, City, ST 12345", leaving out whichever parts are nil.
    def to_s
      located = [line1, city, state].compact.join(', ')
      zipcode ? located + ' ' + zipcode : located
    end

    # The street line. For an intersection both streets are rendered and
    # joined with " and ".
    def line1
      if intersection
        # Index 0 is the first street, index 1 the second one.
        first, second = [0, 1].map do |i|
          [direction[i], street[i], type[i]].compact.join(' ')
        end
        first + " and " + second
      else
        [number, direction, street, type].compact.join(' ')
      end
    end

    # Two addresses are equal when their string forms are equal.
    def ==(other)
      to_s == other.to_s
    end

    # True when zipcode, state, city, street and number are all equal, and
    # type / direction are equal whenever both sides have one.
    def match_essential?(other)
      return false unless zipcode == other.zipcode
      return false unless state == other.state
      return false unless city == other.city
      return false unless street == other.street
      return false unless number == other.number
      return false unless !type || !other.type ||
                          type == other.type
      return false unless !direction || !other.direction ||
                          direction == other.direction
      true
    end

    # The helpers below are class methods. A bare `private` keyword does not
    # affect `def self.` definitions, so they have always been publicly
    # callable and remain so.

    # Capitalizes every word of str; nil stays nil.
    def self.titlize(str)
      str ? str.gsub(/\w+/) { |word| word.capitalize } : nil
    end

    # Returns a lower-cased copy with newlines turned into ", ", whitespace
    # runs collapsed to one space and periods removed. Non-destructive, so
    # frozen input is fine.
    def self.clean(address)
      address.downcase.gsub("\n", ', ').strip.gsub(/\s+/, ' ').delete('.')
    end

    # Peels tokens off the end of a cleaned address (country, zip, state,
    # city, unit) and splits what remains into street parts.
    # Returns a Hash of raw (not yet normalized) tokens.
    def self.tokenize(address)
      # dup, not clone: clone would keep a frozen flag and break the
      # in-place detoken!/cut! calls below.
      address = address.dup
      address.detoken!(REGEXES[:country])
      zipcode = address.detoken!(REGEXES[:zipcode])
      state = address.detoken!(REGEXES[:state])
      # Prefer the city name known for this zipcode, if it is present.
      if zipcode && ZIP_CITY_MAP[zipcode] &&
         (zipcity = ZIP_CITY_MAP[zipcode][:city])
        city = address.detoken!(Regexp.new(zipcity))
      end
      # Otherwise take whatever follows the last comma.
      unless city
        city = address.cut!(Regexp.new('\W*,\W+(' + REGEXES[:city].source +
                                       ')\W*$'))
        city = city.cut!(REGEXES[:city]) if city
      end
      address.detoken_rstrip!(REGEXES[:unit])
      if m = address.match(REGEXES[:intersection])
        intersection = true
        t1, s1, d1 = self.tokenize_street(m[1], false)
        t2, s2, d2 = self.tokenize_street(m[3], false)
        type = [t1, t2]
        street = [s1, s2]
        direction = [d1, d2]
        number = nil
      else
        intersection = false
        type, street, direction, number = self.tokenize_street(address)
      end
      {:zipcode => zipcode,
       :state => state,
       :city => city,
       :type => type,
       :street => street,
       :direction => direction,
       :number => number,
       :intersection => intersection}
    end

    # Splits one street description into its parts.
    # Returns [type, street, direction, number], or [type, street, direction]
    # when has_number is false.
    def self.tokenize_street(address, has_number=true)
      address = address.dup
      number = has_number ? address.detoken_front!(REGEXES[:number]) : nil
      direction = address.detoken_front!(REGEXES[:directional]) ||
                  address.detoken_rstrip!(REGEXES[:directional])
      type = address.detoken_rstrip!(REGEXES[:type])
      street = address.detoken!(REGEXES[:street])
      if has_number
        return type, street, direction, number
      else
        return type, street, direction
      end
    end

    # Returns a copy of the token Hash with every field normalized.
    # For intersections the type/street/direction arrays are normalized
    # element by element (in place on the shared arrays).
    def self.normalize(tokens)
      tokens = tokens.clone
      tokens[:zipcode] = self.normalize_zipcode(tokens[:zipcode])
      tokens[:state] = self.normalize_state(tokens[:state], tokens[:zipcode])
      tokens[:city] = self.normalize_city(tokens[:city], tokens[:zipcode])
      if tokens[:intersection]
        tokens[:type].collect! { |t| self.normalize_type(t) }
        tokens[:street].collect! { |s| self.normalize_street(s) }
        tokens[:direction].collect! { |d| self.normalize_direction(d) }
      else
        tokens[:type] = self.normalize_type(tokens[:type])
        tokens[:street] = self.normalize_street(tokens[:street])
        tokens[:direction] = self.normalize_direction(tokens[:direction])
      end
      tokens
    end

    # Keeps only the first five characters of the zipcode; nil stays nil.
    def self.normalize_zipcode(zipcode)
      zipcode ? zipcode[0, 5] : nil
    end

    # Upper-cased state. The state recorded for the zipcode wins; otherwise
    # the parsed state is mapped through STATE_CODES. A zip entry without a
    # :state no longer raises, it falls back to the parsed state.
    def self.normalize_state(state, zipcode=nil)
      zip_entry = zipcode && ZIP_CITY_MAP[zipcode]
      from_zip = zip_entry && zip_entry[:state]
      return from_zip.upcase if from_zip
      return nil unless state
      (STATE_CODES[state] || state).upcase
    end

    # Title-cased city. The city recorded for the zipcode wins; a zip entry
    # without a :city keeps the parsed city instead of discarding it.
    def self.normalize_city(city, zipcode=nil)
      zip_entry = zipcode && ZIP_CITY_MAP[zipcode]
      city = (zip_entry[:city] || city) if zip_entry
      city ? self.titlize(city) : nil
    end

    # Maps the street type through STREET_TYPES, title-cases it and appends
    # a period; nil stays nil.
    def self.normalize_type(type)
      return nil unless type
      self.titlize(STREET_TYPES[type] || type) + '.'
    end

    # Title-cased street name; nil stays nil.
    def self.normalize_street(street)
      street ? self.titlize(street) : nil
    end

    # Maps the directional through DIRECTIONAL and upper-cases it; nil stays nil.
    def self.normalize_direction(direction)
      return nil unless direction
      (DIRECTIONAL[direction] || direction).upcase
    end
  end
end

data/lib/normalic/phone_number.rb ADDED

@@ -0,0 +1,49 @@
module Normalic
  # A U.S. phone number split into its three parts. Only handles U.S. phone
  # numbers.
  #
  # npa  - first three digits
  # nxx  - next three digits
  # slid - last four digits
  class PhoneNumber
    # Attribute names that #[] and #[]= are allowed to touch.
    FIELDS = %w[npa nxx slid].freeze

    attr_accessor :npa, :nxx, :slid

    # fields - Hash with optional :npa, :nxx and :slid entries.
    def initialize(fields={})
      @npa = fields[:npa]
      @nxx = fields[:nxx]
      @slid = fields[:slid]
    end

    # Builds a PhoneNumber from free-form text.
    #
    # Every non-digit is dropped, then every leading 0 or 1 is dropped.
    # Returns nil when fewer than ten digits remain; digits after the tenth
    # are ignored.
    def self.parse(raw)
      digits = raw.to_s.gsub(/\D/, '').sub(/\A[01]+/, '')
      return nil if digits.length < 10
      self.new(:npa => digits[0, 3],
               :nxx => digits[3, 3],
               :slid => digits[6, 4])
    end

    # "npa nxx slid", separated by single spaces.
    def to_s
      "#{npa} #{nxx} #{slid}"
    end

    # Reads an attribute by name (String or Symbol). Returns nil for any
    # name outside FIELDS, so arbitrary methods can never be invoked.
    def [](field_name)
      name = field_name.to_s
      FIELDS.include?(name) ? send(name) : nil
    end

    # Writes an attribute by name; names outside FIELDS are ignored.
    def []=(field_name, value)
      name = field_name.to_s
      FIELDS.include?(name) ? send("#{name}=", value) : nil
    end

    # Two phone numbers are equal when their string forms are equal.
    def ==(other)
      to_s == other.to_s
    end
  end
end

data/lib/normalic/uri.rb ADDED

@@ -0,0 +1,140 @@
+require 'cgi'
module Normalic
  # A URL split into scheme, user, subdomain, domain, tld, port, path,
  # query hash and fragment.
  #
  # Parsing relies on String#cut!, which is defined outside this class:
  # it removes the first match of a pattern from the receiver and returns
  # the removed text (nil when nothing matched).
  class URI
    # Attribute names that #[] and #[]= are allowed to touch.
    FIELDS = %w[scheme user subdomain domain tld port path query_hash fragment].freeze

    attr_accessor :scheme, :user,
                  :subdomain, :domain, :tld,
                  :port, :path, :query_hash, :fragment

    # fields - Hash keyed by the symbols in FIELDS; missing keys become nil.
    def initialize(fields={})
      @scheme = fields[:scheme]
      @user = fields[:user]
      @subdomain = fields[:subdomain]
      @domain = fields[:domain]
      @tld = fields[:tld]
      @port = fields[:port]
      @path = fields[:path]
      @query_hash = fields[:query_hash]
      @fragment = fields[:fragment]
    end

    # Builds a URI from text, or returns nil when no domain and tld can be
    # found. The scheme defaults to 'http' and the subdomain to 'www'.
    # raw is converted with #to_s and is never modified.
    def self.parse(raw)
      # dup, not clone: clone would keep a frozen flag and break cut!.
      url = raw.to_s.dup
      # parts before the authority, left-to-right
      scheme = self.cut_component(url, /^\w+:\/\//, /:\/\/$/) || 'http'
      # parts after the authority, right-to-left
      fragment = self.cut_component(url, /#.*$/, /^#/)
      query = self.cut_component(url, /\?.*$/, /^\?/)
      query_hash = query ? self.parse_query(query) : nil
      path = self.normalize_path(url.cut!(/\/.*$/))
      # parse the authority
      user = self.cut_component(url, /^.+@/, /@$/)
      port = self.cut_component(url, /:\d+$/, /^:/)
      tld = self.cut_component(url, /\.\w+$/, /^\./)
      # Hyphens are legal inside a domain label ("my-site.com").
      domain = self.cut_component(url, /(\.|\A)[\w-]+$/, /^\./)
      return nil unless tld && domain
      # Whatever is left in front of the domain is the subdomain.
      subdomain = url.empty? ? 'www' : url
      self.new(:scheme => scheme,
               :user => user,
               :subdomain => subdomain,
               :domain => domain,
               :tld => tld,
               :port => port,
               :path => path,
               :query_hash => query_hash,
               :fragment => fragment)
    end

    # Reassembles the URL from its parts, skipping the ones that are nil.
    # Query values are written as stored (they are not re-escaped).
    def to_s
      host = [subdomain, domain, tld].compact.join('.')
      query = query_hash ? '?' + query_hash.map { |k, v| "#{k}=#{v}" }.join('&') : nil
      [scheme ? "#{scheme}://" : nil,
       user ? "#{user}@" : nil,
       host.empty? ? nil : host,
       port ? ":#{port}" : nil,
       path,
       query,
       fragment ? '#' + fragment : nil].compact.join
    end

    # Reads an attribute by name (String or Symbol). Returns nil for any
    # name outside FIELDS, so arbitrary methods can never be invoked.
    def [](field_name)
      name = field_name.to_s
      FIELDS.include?(name) ? send(name) : nil
    end

    # Writes an attribute by name; names outside FIELDS are ignored.
    def []=(field_name, value)
      name = field_name.to_s
      FIELDS.include?(name) ? send("#{name}=", value) : nil
    end

    # Two URIs are equal when their string forms are equal.
    def ==(other)
      to_s == other.to_s
    end

    # True when tld and domain are equal and the subdomains are equal,
    # treating a missing subdomain as equivalent to 'www'.
    def match_essential?(other)
      return false unless tld == other.tld
      return false unless domain == other.domain
      return false unless subdomain == other.subdomain ||
                          (subdomain == 'www' && !other.subdomain) ||
                          (!subdomain && other.subdomain == 'www')
      true
    end

    # The helpers below are class methods. A bare `private` keyword does not
    # affect `def self.` definitions, so they have always been publicly
    # callable and remain so.

    # Cuts the first match of pattern out of str (mutating str), strips the
    # delimiter from the cut piece and returns it; nil when nothing matched.
    def self.cut_component(str, pattern, delimiter)
      piece = str.cut!(pattern)
      piece.cut!(delimiter) if piece
      piece
    end

    # Collapses empty and '.' segments, resolves '..' against the previous
    # segment, and returns the result with a leading '/'. nil gives '/'.
    def self.normalize_path(raw)
      segments = []
      raw.to_s.split('/').each do |segment|
        if segment.empty? || segment == '.'
          next
        elsif segment == '..'
          segments.pop
        else
          segments << segment
        end
      end
      '/' + segments.join('/')
    end

    # Parses "a=1&b=2" (an optional leading '?' is dropped) into a Hash of
    # String keys to CGI-unescaped String values. A key without '=' maps to
    # ''. Only the first '=' of a pair separates key from value, and empty
    # pairs ("a=1&&b=2") are skipped.
    def self.parse_query(raw)
      query_hash = {}
      raw.to_s.sub(/^\?/, '').split('&').each do |pair|
        next if pair.empty?
        key, value = pair.split('=', 2)
        query_hash[key] = CGI.unescape(value || '')
      end
      query_hash
    end
  end
end