address_extractor 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,23 @@
1
+ h4. Copyright and License
2
+
3
+ The MIT License
4
+
5
+ Copyright (c) 2008 Jim Garvin
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ THE SOFTWARE.
@@ -0,0 +1,6 @@
1
+ lib/address_extractor.rb
2
+ LICENSE.textile
3
+ Manifest
4
+ Rakefile
5
+ README.textile
6
+ test/test_address_extractor.rb
@@ -0,0 +1,48 @@
1
+ h1. AddressExtractor
2
+
3
+ Find and/or replace mailing addresses in strings.
4
+
5
+ h2. Examples
6
+
7
+ <pre><code>
8
+ require 'rubygems'
9
+ require 'address_extractor'
10
+
11
+ string = <<EOF
12
+ Please send the package to 123 Foo St., Someplace FL
13
+
14
+ My phone number is 123-1234 and St. Marc of Israel can be reached
15
+ via mail at:
16
+ 123 Goob Avenue
17
+ Apt 123
18
+ Nice Town CA 12345
19
+ EOF
20
+
21
+ # Find first address
22
+ AddressExtractor.first_address(string) # => { :street1 => "123 Foo St.", :city => "Someplace", :state => "FL" }
23
+
24
+ # Find all addresses
25
+ AddressExtractor.find_addresses(string) # =>
26
+ # [
27
+ # { :street1 => "123 Foo St.", :city => "Someplace", :state => "FL" },
28
+ # { :street1 => "123 Goob Avenue", :street2 => "Apt 123", :city => "Nice Town", :state => "CA", :zip => "12345" }
29
+ # ]
30
+
31
+ # Do a gsub on first address
32
+ new_string = AddressExtractor.replace_first_address(string) do |address_hash, address_string|
33
+ map_link_to(address_string)
34
+ end
35
+
36
+ # Do a gsub on all addresses
37
+ new_string = AddressExtractor.replace_addresses(string) do |address_hash, address_string|
38
+ map_link_to(address_string)
39
+ end
40
+ </code></pre>
41
+
42
+ h3. About
43
+
44
+ Written by Jim Garvin ("coderifous":http://github.com/coderifous) at RubyConf '08 at the request of Chris Murphy ("chmurph2":http://github.com/chmurph2) and Ryan McGeary ("rmm5t":http://github.com/rmm5t) so they could use it in their awesome "invitation and survey app":http://yarp.com.
45
+
46
+ You can use it, too.
47
+
48
+ The address-finding regex may be a bit naive; I'll gladly accept pull requests that add to the test data and tests.
@@ -0,0 +1,14 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'echoe'
4
+
# Gem metadata for address_extractor, declared through Echoe.
Echoe.new('address_extractor', '0.1.4') do |gem|
  gem.description              = "Give it text. It finds addresses in it."
  gem.url                      = "http://github.com/coderifous/address_extractor"
  gem.author                   = "Jim Garvin"
  gem.email                    = "jim at thegarvin dot com"
  gem.ignore_pattern           = ["tmp/*", "script/*"]
  gem.development_dependencies = []
end

# Load every extra rake task file found in tasks/, in sorted (stable) order.
extra_task_files = Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort
extra_task_files.each { |task_file| load task_file }
@@ -0,0 +1,30 @@
# Gem specification for address_extractor 0.1.4.
#
# Setters that do not exist on every RubyGems release (has_rdoc= and
# rubyforge_project= are deprecated in later releases) are guarded with
# respond_to?, in the same way required_rubygems_version= already was, so
# the spec loads on both old and new RubyGems.
Gem::Specification.new do |s|
  s.name = %q{address_extractor}
  s.version = "0.1.4"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jim Garvin"]
  s.date = %q{2008-11-21}
  s.description = %q{Give it text. It finds addresses in it.}
  s.email = %q{jim at thegarvin dot com}
  s.extra_rdoc_files = ["lib/address_extractor.rb", "LICENSE.textile", "README.textile"]
  s.files = ["lib/address_extractor.rb", "LICENSE.textile", "Manifest", "Rakefile", "README.textile", "test/test_address_extractor.rb", "address_extractor.gemspec", "test/test_helper.rb"]
  s.has_rdoc = true if s.respond_to? :has_rdoc=
  s.homepage = %q{http://github.com/coderifous/address_extractor}
  s.rdoc_options = ["--line-numbers", "--title", "Address_extractor", "--main", "README.textile"]
  s.require_paths = ["lib"]
  s.rubyforge_project = %q{address_extractor} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.2.0}
  s.summary = %q{Give it text. It finds addresses in it.}
  s.test_files = ["test/test_address_extractor.rb", "test/test_helper.rb"]

  # The gem declares no dependencies, so the only version-specific step is
  # pinning the specification format itself.
  s.specification_version = 2 if s.respond_to? :specification_version
end
@@ -0,0 +1,176 @@
# Finds, and optionally replaces, US-style mailing addresses in free text.
#
# Everything is exposed as class methods. An address is reported as a hash
# with some of the keys :street1, :street2, :city, :state and :zip; keys
# for parts that were not present in the text are simply absent.
#
# A regex match only counts as an address when it has a street plus either
# a zip code or both a city and a state (see +useful_address?+).
class AddressExtractor
  # Hash key for each capture group of ADDRESS_PATTERN, in group order.
  # :zip appears twice because the pattern has two alternative zip groups
  # (zip after city/state, or a lone zip); at most one of them matches.
  CAPTURE_MAP = [ :street1, :street2, :city, :state, :zip, :zip ].freeze

  STATES = <<-EOF
    ALABAMA AL
    ALASKA AK
    AMERICAN SAMOA AS
    ARIZONA AZ
    ARKANSAS AR
    CALIFORNIA CA
    COLORADO CO
    CONNECTICUT CT
    DELAWARE DE
    DISTRICT OF COLUMBIA DC
    FEDERATED STATES OF MICRONESIA FM
    FLORIDA FL
    GEORGIA GA
    GUAM GU
    HAWAII HI
    IDAHO ID
    ILLINOIS IL
    INDIANA IN
    IOWA IA
    KANSAS KS
    KENTUCKY KY
    LOUISIANA LA
    MAINE ME
    MARSHALL ISLANDS MH
    MARYLAND MD
    MASSACHUSETTS MA
    MICHIGAN MI
    MINNESOTA MN
    MISSISSIPPI MS
    MISSOURI MO
    MONTANA MT
    NEBRASKA NE
    NEVADA NV
    NEW HAMPSHIRE NH
    NEW JERSEY NJ
    NEW MEXICO NM
    NEW YORK NY
    NORTH CAROLINA NC
    NORTH DAKOTA ND
    NORTHERN MARIANA ISLANDS MP
    OHIO OH
    OKLAHOMA OK
    OREGON OR
    PALAU PW
    PENNSYLVANIA PA
    PUERTO RICO PR
    RHODE ISLAND RI
    SOUTH CAROLINA SC
    SOUTH DAKOTA SD
    TENNESSEE TN
    TEXAS TX
    UTAH UT
    VERMONT VT
    VIRGIN ISLANDS VI
    VIRGINIA VA
    WASHINGTON WA
    WEST VIRGINIA WV
    WISCONSIN WI
    WYOMING WY
  EOF

  # Alternation of every state name and abbreviation listed in STATES.
  # ADDRESS_PATTERN is compiled in extended (/x) mode, where literal
  # whitespace in the pattern is ignored, so the spaces inside multi-word
  # names ("NEW YORK") are rewritten to an explicit \s+ here; otherwise
  # those names could never match.
  STATE_REGEX = STATES.split(/\n/).collect { |line|
    line.scan(/(\w.*\w)\s*([A-Z]{2})\s*$/)
  }.flatten.collect { |name|
    name.gsub(/\s+/) { '\s+' }
  }.join("|")

  SECONDARY_UNIT_DESIGNATORS = <<-EOF
    APARTMENT APT
    BASEMENT BSMT
    BUILDING BLDG
    DEPARTMENT DEPT
    FLOOR FL
    FRONT FRNT
    HANGAR HNGR
    LOBBY LBBY
    LOT LOT
    LOWER LOWR
    OFFICE OFC
    PENTHOUSE PH
    PIER PIER
    REAR REAR
    ROOM RM
    SIDE SIDE
    SLIP SLIP
    SPACE SPC
    STOP STOP
    SUITE STE
    TRAILER TRLR
    UNIT UNIT
    UPPER UPPR
  EOF

  # Alternation of every secondary-unit word and abbreviation listed above.
  SECONDARY_UNIT_DESIGNATORS_REGEX = SECONDARY_UNIT_DESIGNATORS.split(/\n/).collect { |line|
    line.scan(/(\w+)\s*(\w+)\s*$/)
  }.flatten.join("|")

  # Capture groups, in order: street1, street2, city, state, zip, zip
  # (see CAPTURE_MAP).
  ADDRESS_PATTERN = /
    (
      \d+                         # A few numbers
      \s+
      (?:[A-Za-z'.-]+\s?){1,5}   # Followed by a street name
    )
    \s* ,? \s*                    # a comma, optionally
    (
      (?:\d+\s+)?                 # a secondary unit, optionally
      (?:#{SECONDARY_UNIT_DESIGNATORS_REGEX})
      (?:\s+\d+)?
    )?
    \s* ,? \s*                    # a comma, optionally
    (?:
      (?:
        ( (?:[A-Za-z]+\s?){0,2} (?:[A-Za-z]+) ) # city
        \s* ,? \s*                # a comma, optionally
        \b(#{STATE_REGEX})\b      # state
        \s* ,? \s*                # a comma, optionally
        (\d{5})?                  # a zip code, optionally
      )
      |                           # or, instead of city and state
      (\d{5})?                    # a lone zip code will do
    )
  /xi

  class << self

    # Returns the hash for the first address found in +string+.
    # Returns nil if no address is found.
    #
    # Regex matches that do not amount to a useful address (for example
    # "555 now" in "call 555 now") are skipped rather than ending the search.
    def first_address(string)
      string.scan(ADDRESS_PATTERN) do |captures|
        address = hashify_results(captures)
        return address if address
      end
      nil
    end

    # Returns an array of hashes, one for each address found.
    # Returns an empty array if no addresses are found.
    def find_addresses(string)
      string.scan(ADDRESS_PATTERN).collect { |captures| hashify_results(captures) }.compact
    end

    # Pass it a block that receives 2 parameters:
    #   address hash
    #   matched address string
    # Whatever your block returns will be used for the substitution.
    # Returns a new string with the substitution applied to the first
    # identified address; text that merely resembles an address is left alone.
    # If no address is found, returns the same string unaltered.
    def replace_first_address(string)
      replaced = false
      string.gsub(ADDRESS_PATTERN) do |match|
        next match if replaced
        # Read the captures before yielding: the caller's block may run
        # its own regexes and overwrite the last-match data.
        address = hashify_results(Regexp.last_match.captures)
        next match unless address
        replaced = true
        yield(address, match)
      end
    end

    # Same as +replace_first_address+ but applies the substitution to all
    # identified addresses.
    def replace_addresses(string)
      string.gsub(ADDRESS_PATTERN) do |match|
        address = hashify_results(Regexp.last_match.captures)
        address ? yield(address, match) : match
      end
    end

    private

    # Turns the capture array of one ADDRESS_PATTERN match into an address
    # hash keyed per CAPTURE_MAP. Groups that did not participate in the
    # match are left out of the hash. Returns nil when +matches+ is nil or
    # the captured parts do not form a useful address.
    def hashify_results(matches)
      return nil if matches.nil?
      result = { }
      CAPTURE_MAP.each_with_index do |field, capture_index|
        captured = matches[capture_index]
        # chomp drops the newline a street line picks up in multi-line text.
        result[field] = captured.to_s.chomp if captured
      end
      useful_address?(result) ? result : nil
    end

    # An address is useful when it has a street and either a zip code or
    # both a city and a state.
    def useful_address?(hash)
      hash &&
        hash[:street1] && ( hash[:zip] || hash[:city] && hash[:state] )
    end

  end
end
@@ -0,0 +1,89 @@
1
+ require 'address_extractor'
2
+ require 'test_helper'
3
+ include TestDataHelper
4
+
# Exercises AddressExtractor against the fixtures registered with
# +test_input+ in this file.
class AddressExtractorTest < Test::Unit::TestCase
  # The first expected address of every fixture must be what
  # first_address returns.
  def test_first_address_extraction
    each_test_data do |fixture|
      address = AddressExtractor.first_address(fixture[:input])
      flunk "No address found in:\n#{fixture[:input]}" if address.nil?
      assert_equal_hashes fixture[:expected_output].first, address
    end
  end

  # find_addresses must return exactly the expected addresses, in order.
  def test_find_addresses
    each_test_data do |fixture|
      expected_addresses = fixture[:expected_output]
      addresses = AddressExtractor.find_addresses(fixture[:input])
      # Expected value goes first so failure messages read correctly.
      assert_equal expected_addresses.size, addresses.size
      expected_addresses.each_with_index do |expected_address, index|
        assert_equal_hashes expected_address, addresses[index]
      end
    end
  end

  # The block receives the address hash and the matched text, and its
  # return value replaces the first address only.
  def test_replace_first_address
    string = AddressExtractor.replace_first_address(test_data.first[:input]) do |address_hash, address|
      assert_equal_hashes test_data.first[:expected_output].first, address_hash
      assert_match(/^\s*123 Foo St., Someplace FL\s*/, address)
      "skidoosh"
    end
    assert_match(/Please send the package to skidoosh/, string)
  end

  # Every address in the first fixture gets replaced.
  def test_replace_addresses
    string = AddressExtractor.replace_addresses(test_data.first[:input]) do |_address_hash, _address|
      "skidoosh"
    end
    assert_match(/Please send the package to skidoosh/, string)
    assert_match(/via mail at:\s+skidoosh/, string)
  end

  # Text without an address yields nil / [] and is returned unchanged.
  def test_no_addresses_found
    assert_nil AddressExtractor.first_address("foo")
    assert_equal [], AddressExtractor.find_addresses("foo")
    assert_equal "foo", AddressExtractor.replace_first_address("foo")
    assert_equal "foo", AddressExtractor.replace_addresses("foo")
  end
end
48
+
49
+ # Test Input/Expected outputs defined below using test_input helper
50
+ # Expanding the tests will probably start with adding new test input
51
+
52
+ test_input "
53
+ Please send the package to 123 Foo St., Someplace FL
54
+
55
+ My phone number is 123-1234 and St. Marc of Israel can be reached
56
+ via mail at:
57
+ 123 Goob Avenue
58
+ Apt 123
59
+ Nice Town CA 12345",
60
+ { :street1 => "123 Foo St.", :street2 => nil, :city => "Someplace", :state => "FL", :zip => nil },
61
+ { :street1 => "123 Goob Avenue", :street2 => "Apt 123", :city => "Nice Town", :state => "CA", :zip => "12345" }
62
+
63
+ test_input "Let's meet tomorrow at noon at 123 Foo Bar Street, Scooby NY 12345",
64
+ { :street1 => "123 Foo Bar Street", :street2 => nil, :city => "Scooby", :state => "NY", :zip => "12345" }
65
+
66
+ test_input "Let's meet tomorrow at noon at 123 Foo Bar Street, Scooby, NY 12345",
67
+ { :street1 => "123 Foo Bar Street", :street2 => nil, :city => "Scooby", :state => "NY", :zip => "12345" }
68
+
69
+ test_input "Let's meet tomorrow at noon at 123 Foo Bar Street, Scooby, NY, 12345",
70
+ { :street1 => "123 Foo Bar Street", :street2 => nil, :city => "Scooby", :state => "NY", :zip => "12345" }
71
+
72
+ test_input "Let's meet tomorrow at noon at 123 Foo Bar Street, 12345",
73
+ { :street1 => "123 Foo Bar Street", :street2 => nil, :city => nil, :state => nil, :zip => "12345" }
74
+
75
+ test_input "
76
+ Apple Computer, Inc.
77
+ 1 Infinite Loop
78
+ Cupertino, CA 95014",
79
+ { :street1 => "1 Infinite Loop", :street2 => nil, :city => "Cupertino", :state => "CA", :zip => "95014" }
80
+
81
+ test_input "Apple Computer, Inc. 1 Infinite Loop, Cupertino, CA 95014",
82
+ { :street1 => "1 Infinite Loop", :street2 => nil, :city => "Cupertino", :state => "CA", :zip => "95014" }
83
+
84
+ test_input "Ida Lee Park Recreation Center 60 Ida Lee Dr NW, Leesburg, VA",
85
+ { :street1 => "60 Ida Lee Dr NW", :street2 => nil, :city => "Leesburg", :state => "VA", :zip => nil }
86
+
87
+ test_input "Ida Lee Park Recreation Center 60 Ida Lee Dr N West, Leesburg, VA",
88
+ { :street1 => "60 Ida Lee Dr N West", :street2 => nil, :city => "Leesburg", :state => "VA", :zip => nil }
89
+
@@ -0,0 +1,29 @@
1
+ require 'test/unit'
2
+ require 'rubygems'
3
+ begin require 'redgreen' unless ENV['TM_FILENAME']; rescue LoadError; end
4
+
# Registry of test fixtures shared by everything that includes this module.
#
# The fixtures live in one array owned by the module itself, so fixtures
# registered from one includer are visible to every other includer.
module TestDataHelper
  @test_data = []

  class << self
    # The single shared fixture array.
    attr_reader :test_data
  end

  # Registers a fixture: the text to scan and the address hashes expected
  # to be found in it, in order. Returns the fixture array.
  def test_input(input_string, *expected_outputs)
    test_data << { :input => input_string, :expected_output => expected_outputs }
  end

  # Yields every registered fixture hash (:input, :expected_output) in
  # registration order. Without a block, returns an Enumerator.
  def each_test_data(&block)
    test_data.each(&block)
  end

  # All fixtures registered so far.
  def test_data
    TestDataHelper.test_data
  end
end
18
+
module Helpers
  # Compares two hashes key by key over the union of their keys, so a key
  # missing on one side is compared as nil. Each comparison goes through
  # assert_equal with a message naming the key and both values.
  def assert_equal_hashes(expected, hash)
    every_key = expected.keys | hash.keys
    every_key.each do |key|
      wanted = expected[key]
      actual = hash[key]
      assert_equal wanted, actual, "expected[#{key.inspect}] = #{wanted.inspect} != hash[#{key.inspect}] = #{actual.inspect}"
    end
  end
end
26
+
27
+ class Test::Unit::TestCase
28
+ include Helpers
29
+ end
metadata ADDED
@@ -0,0 +1,69 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: address_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.4
5
+ platform: ruby
6
+ authors:
7
+ - Jim Garvin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-11-21 00:00:00 -05:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Give it text. It finds addresses in it.
17
+ email: jim at thegarvin dot com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - lib/address_extractor.rb
24
+ - LICENSE.textile
25
+ - README.textile
26
+ files:
27
+ - lib/address_extractor.rb
28
+ - LICENSE.textile
29
+ - Manifest
30
+ - Rakefile
31
+ - README.textile
32
+ - test/test_address_extractor.rb
33
+ - address_extractor.gemspec
34
+ - test/test_helper.rb
35
+ has_rdoc: true
36
+ homepage: http://github.com/coderifous/address_extractor
37
+ licenses: []
38
+
39
+ post_install_message:
40
+ rdoc_options:
41
+ - --line-numbers
42
+ - --title
43
+ - Address_extractor
44
+ - --main
45
+ - README.textile
46
+ require_paths:
47
+ - lib
48
+ required_ruby_version: !ruby/object:Gem::Requirement
49
+ requirements:
50
+ - - ">="
51
+ - !ruby/object:Gem::Version
52
+ version: "0"
53
+ version:
54
+ required_rubygems_version: !ruby/object:Gem::Requirement
55
+ requirements:
56
+ - - ">="
57
+ - !ruby/object:Gem::Version
58
+ version: "1.2"
59
+ version:
60
+ requirements: []
61
+
62
+ rubyforge_project: address_extractor
63
+ rubygems_version: 1.3.5
64
+ signing_key:
65
+ specification_version: 2
66
+ summary: Give it text. It finds addresses in it.
67
+ test_files:
68
+ - test/test_address_extractor.rb
69
+ - test/test_helper.rb