coderifous-address_extractor 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
data/LICENSE.textile ADDED
@@ -0,0 +1,23 @@
1
+ h4. Copyright and License
2
+
3
+ The MIT License
4
+
5
+ Copyright (c) 2008 Jim Garvin
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,45 @@
1
+ h1. AddressExtractor
2
+
3
+ Find and/or replace mailing addresses in strings.
4
+
5
+ h2. Examples
6
+
7
+ <pre><code>
8
+ string = <<EOF
9
+ Please send the package to 123 Foo St., Someplace FL
10
+
11
+ My phone number is 123-1234 and St. Marc of Israel can be reached
12
+ via mail at:
13
+ 123 Goob Avenue
14
+ Apt 123
15
+ Nice Town CA 123456
16
+ EOF
17
+
18
+ # Find first address
19
+ AddressExtractor.first_address(string) # => { :street1 => "123 Foo St.", :city => "Someplace", :state => "FL" }
20
+
21
+ # Find all addresses
22
+ AddressExtractor.find_addresses(string) # =>
23
+ # [
24
+ # { :street1 => "123 Foo St.", :city => "Someplace", :state => "FL" },
25
+ # { :street1 => "123 Goob Avenue", :street2 => "Apt 123", :city => "Nice Town", :state => "CA", :zip => "123456" }
26
+ # ]
27
+
28
+ # Do a gsub on first address
29
+ new_string = AddressExtractor.replace_first_address(string) do |address_hash, address_string|
30
+ map_link_to(address_string)
31
+ end
32
+
33
+ # Do a gsub on all addresses
34
+ new_string = AddressExtractor.replace_addresses(string) do |address_hash, address_string|
35
+ map_link_to(address_string)
36
+ end
37
+ </code></pre>
38
+
39
+ h3. About
40
+
41
+ Written by Jim Garvin at RubyConf '08 at the request of Chris Murphy and Ryan McGeary so they could use it in Yarp.com.
42
+
43
+ You can use it, too.
44
+
45
+ The address-finding regex may be a bit naive; I'll gladly accept pull requests that add to the test data and tests.
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ require 'rubygems'
2
+ require 'rake'
3
+ require 'echoe'
4
+
5
# Declare the gem's packaging metadata through Echoe.
Echoe.new('address_extractor', '0.1.0') do |gem_config|
  gem_config.description              = "Give it text. It finds addresses in it."
  gem_config.url                      = "http://github.com/coderifous/address_extractor"
  gem_config.author                   = "Jim Garvin"
  gem_config.email                    = "jim at thegarvin dot com"
  gem_config.ignore_pattern           = ["tmp/*", "script/*"]
  gem_config.development_dependencies = []
end

# Load every extra task file found in tasks/, in sorted (stable) order.
extra_task_files = Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort
extra_task_files.each do |task_file|
  load task_file
end
@@ -0,0 +1,30 @@
1
# Gem specification for address_extractor 0.1.0.
#
# Setters that do not exist on every RubyGems release are guarded with
# respond_to? so the file can be evaluated by old and new RubyGems alike.
Gem::Specification.new do |s|
  s.name = %q{address_extractor}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jim Garvin"]
  s.date = %q{2008-11-14}
  s.description = %q{Give it text. It finds addresses in it.}
  s.email = %q{jim at thegarvin dot com}
  s.extra_rdoc_files = ["lib/address_extractor.rb", "LICENSE.textile", "README.textile"]
  s.files = ["lib/address_extractor.rb", "LICENSE.textile", "Manifest", "Rakefile", "README.textile", "test/test_address_extractor.rb", "address_extractor.gemspec"]
  # has_rdoc= is deprecated in later RubyGems; only call it where it exists.
  s.has_rdoc = true if s.respond_to? :has_rdoc=
  s.homepage = %q{http://github.com/coderifous/address_extractor}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Address_extractor", "--main", "README.textile"]
  s.require_paths = ["lib"]
  # rubyforge_project= is likewise deprecated; guard it the same way.
  s.rubyforge_project = %q{address_extractor} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.2.0}
  s.summary = %q{Give it text. It finds addresses in it.}
  s.test_files = ["test/test_address_extractor.rb"]

  # The gem declares no dependencies, so the generated (empty) per-version
  # dependency branches are dropped; only the spec version is recorded.
  s.specification_version = 2 if s.respond_to? :specification_version
end
@@ -0,0 +1,162 @@
1
# Finds and replaces US-style mailing addresses embedded in free-form text.
#
# Everything is exposed as class methods. An address is returned as a Hash
# keyed by the symbols in CAPTURE_MAP (:street1, :street2, :city, :state,
# :zip); keys are omitted for parts that were not found.
class AddressExtractor

  # Field name for each capture group of ADDRESS_PATTERN, in group order.
  # :zip is listed twice because the pattern has two alternative zip groups
  # (one after city and state, one standing alone).
  CAPTURE_MAP = [ :street1, :street2, :city, :state, :zip, :zip ]

  # State and territory names with their two-letter abbreviations, one per line.
  STATES = <<-EOF
    ALABAMA AL
    ALASKA AK
    AMERICAN SAMOA AS
    ARIZONA AZ
    ARKANSAS AR
    CALIFORNIA CA
    COLORADO CO
    CONNECTICUT CT
    DELAWARE DE
    DISTRICT OF COLUMBIA DC
    FEDERATED STATES OF MICRONESIA FM
    FLORIDA FL
    GEORGIA GA
    GUAM GU
    HAWAII HI
    IDAHO ID
    ILLINOIS IL
    INDIANA IN
    IOWA IA
    KANSAS KS
    KENTUCKY KY
    LOUISIANA LA
    MAINE ME
    MARSHALL ISLANDS MH
    MARYLAND MD
    MASSACHUSETTS MA
    MICHIGAN MI
    MINNESOTA MN
    MISSISSIPPI MS
    MISSOURI MO
    MONTANA MT
    NEBRASKA NE
    NEVADA NV
    NEW HAMPSHIRE NH
    NEW JERSEY NJ
    NEW MEXICO NM
    NEW YORK NY
    NORTH CAROLINA NC
    NORTH DAKOTA ND
    NORTHERN MARIANA ISLANDS MP
    OHIO OH
    OKLAHOMA OK
    OREGON OR
    PALAU PW
    PENNSYLVANIA PA
    PUERTO RICO PR
    RHODE ISLAND RI
    SOUTH CAROLINA SC
    SOUTH DAKOTA SD
    TENNESSEE TN
    TEXAS TX
    UTAH UT
    VERMONT VT
    VIRGIN ISLANDS VI
    VIRGINIA VA
    WASHINGTON WA
    WEST VIRGINIA WV
    WISCONSIN WI
    WYOMING WY
  EOF

  # "NAME|ABBR|NAME|ABBR|..." alternation built from STATES. The nested
  # arrays returned by scan are flattened by join.
  STATE_REGEX = STATES.split(/\n/).collect{ |n| n.scan(/(\w.*\w)\s*([A-Z]{2})\s*$/) }.join("|")

  # STATE_REGEX made safe for an extended (x-flag) pattern. Extended mode
  # ignores literal whitespace, so "NEW YORK" would otherwise only match
  # "NEWYORK"; each run of spaces is turned into an explicit \s+ instead.
  STATE_PATTERN = STATE_REGEX.gsub(/\s+/) { '\s+' }

  # Secondary unit designators with their abbreviations, one per line.
  SECONDARY_UNIT_DESIGNATORS = <<-EOF
    APARTMENT APT
    BASEMENT BSMT
    BUILDING BLDG
    DEPARTMENT DEPT
    FLOOR FL
    FRONT FRNT
    HANGAR HNGR
    LOBBY LBBY
    LOT LOT
    LOWER LOWR
    OFFICE OFC
    PENTHOUSE PH
    PIER PIER
    REAR REAR
    ROOM RM
    SIDE SIDE
    SLIP SLIP
    SPACE SPC
    STOP STOP
    SUITE STE
    TRAILER TRLR
    UNIT UNIT
    UPPER UPPR
  EOF

  # "NAME|ABBR|..." alternation built from SECONDARY_UNIT_DESIGNATORS.
  SECONDARY_UNIT_DESIGNATORS_REGEX = SECONDARY_UNIT_DESIGNATORS.split(/\n/).collect{ |n| n.scan(/(\w+)\s*(\w+)\s*$/) }.join("|")

  # Case-insensitive address pattern. Capture groups line up with CAPTURE_MAP.
  #
  # NOTE(review): both zip groups match exactly six digits, which the existing
  # tests and README rely on; US ZIP codes are five digits. Confirm the intent
  # before changing it.
  ADDRESS_PATTERN = /
    (
      \d+                           # A few numbers
      \s+
      (?:[A-Za-z'.-]+\s?){1,3}      # Followed by a street name
    )
    \s* ,? \s*
    (
      (?:\d+\s+)?                   # a secondary unit, optionally
      (?:#{SECONDARY_UNIT_DESIGNATORS_REGEX})\b   # whole word only, so FL does not eat Florence
      (?:\s+\d+)?
    )?
    \s* ,? \s*                      # a comma, optionally
    (?:
      (?:
        ((?:[A-Za-z]+\s?){1,3})     # city
        \s+
        \b(#{STATE_PATTERN})\b      # state
        \s* ,? \s*                  # a comma, optionally
        (\d{6})?                    # a zip code, optionally
      )
      |                             # or, instead of city and state
      (\d{6})?                      # a lone zip code will do
    )
  /xi

  class << self

    # Returns the first useful address found in +string+ as a Hash, or nil
    # when the text contains none. Pattern hits that do not amount to a
    # useful address (see useful_address?) are skipped rather than ending
    # the search.
    def first_address(string)
      find_addresses(string).first
    end

    # Returns an Array with one Hash per useful address in +string+, in order
    # of appearance. The Array is empty when nothing is found.
    def find_addresses(string)
      string.scan(ADDRESS_PATTERN).collect { |captures| hashify_results(captures) }.compact
    end

    # Returns a copy of +string+ in which the first useful address is replaced
    # by the block's result. The block receives the address Hash and the
    # matched text. The string comes back unchanged when no address is found.
    def replace_first_address(string)
      replaced = false
      # gsub visits every pattern hit so that non-address hits preceding the
      # first real address can be passed over; only one hit is ever replaced.
      string.gsub(ADDRESS_PATTERN) do |match|
        hash = replaced ? nil : hashify_results(Regexp.last_match.captures)
        if hash
          replaced = true
          yield(hash, match)
        else
          match
        end
      end
    end

    # Returns a copy of +string+ in which every useful address is replaced by
    # the block's result. The block receives the address Hash and the matched
    # text. Pattern hits that are not useful addresses are left as they are.
    def replace_addresses(string)
      string.gsub(ADDRESS_PATTERN) do |match|
        hash = hashify_results(Regexp.last_match.captures)
        hash ? yield(hash, match) : match
      end
    end

    # Converts the capture Array of one ADDRESS_PATTERN match into an address
    # Hash. Returns nil when +matches+ is nil or when the captured parts do
    # not form a useful address.
    def hashify_results(matches)
      return nil unless matches
      result = { }
      CAPTURE_MAP.each_with_index do |field, capture_index|
        captured = matches[capture_index]
        # chomp drops the line break the street group may have swallowed.
        result[field] = captured.to_s.chomp if captured
      end
      useful_address?(result) ? result : nil
    end

    # True when +hash+ has a street line plus either a zip code or both a
    # city and a state; false otherwise, including for nil.
    def useful_address?(hash)
      return false unless hash && hash[:street1]
      ( hash[:zip] || ( hash[:city] && hash[:state] ) ) ? true : false
    end

  end

end
@@ -0,0 +1,71 @@
1
+ $: << File.dirname(__FILE__)+"/../lib"
2
+
3
+ require 'test/unit'
4
+ require 'address_extractor.rb'
5
+
6
# Unit tests for AddressExtractor's public class methods, driven by the DATA1
# fixture text defined elsewhere in this file.
class AddressExtractorTest < Test::Unit::TestCase

  # Shared assertions describing the two addresses embedded in DATA1.
  module Helpers
    # Asserts that +a+ is the hash for "123 Foo St., Someplace FL".
    def assert_first_address(a)
      assert_not_nil a
      assert_equal "123 Foo St.", a[:street1]
      assert_nil a[:street2]
      assert_equal "Someplace", a[:city]
      assert_equal "FL", a[:state]
      assert_nil a[:zip]
    end

    # Asserts that +string+ is the matched text of the first address; the
    # match may carry trailing whitespace.
    def assert_first_address_string(string)
      assert_match(/^123 Foo St\., Someplace FL\s*$/, string)
    end

    # Asserts that +a+ is the hash for the multi-line "123 Goob Avenue" address.
    def assert_second_address(a)
      assert_not_nil a
      assert_equal "123 Goob Avenue", a[:street1]
      assert_equal "Apt 123", a[:street2]
      assert_equal "Nice Town", a[:city]
      assert_equal "CA", a[:state]
      assert_equal "123456", a[:zip]
    end
  end
  include Helpers

  def test_first_address_extraction
    address = AddressExtractor.first_address(DATA1)
    assert_first_address(address)
  end

  def test_find_addresses
    addresses = AddressExtractor.find_addresses(DATA1)
    assert_first_address addresses[0]
    assert_second_address addresses[1]
  end

  def test_replace_first_address
    string = AddressExtractor.replace_first_address(DATA1) do |address_hash, address|
      assert_first_address address_hash
      assert_first_address_string address
      "skidoosh"
    end
    assert_match(/Please send the package to skidoosh/, string)
  end

  def test_replace_addresses
    string = AddressExtractor.replace_addresses(DATA1) do |_address_hash, _address|
      "skidoosh"
    end
    assert_match(/Please send the package to skidoosh/, string)
    # \s* keeps the check independent of how far the fixture's address
    # lines are indented.
    assert_match(/via mail at:\n\s*skidoosh/, string)
  end

end
62
+
63
+ DATA1 = <<EOF
64
+ Please send the package to 123 Foo St., Someplace FL
65
+
66
+ My phone number is 123-1234 and St. Marc of Israel can be reached
67
+ via mail at:
68
+ 123 Goob Avenue
69
+ Apt 123
70
+ Nice Town CA 123456
71
+ EOF
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: coderifous-address_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jim Garvin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-11-14 00:00:00 -08:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Give it text. It finds addresses in it.
17
+ email: jim at thegarvin dot com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - lib/address_extractor.rb
24
+ - LICENSE.textile
25
+ - README.textile
26
+ files:
27
+ - lib/address_extractor.rb
28
+ - LICENSE.textile
29
+ - Manifest
30
+ - Rakefile
31
+ - README.textile
32
+ - test/test_address_extractor.rb
33
+ - address_extractor.gemspec
34
+ has_rdoc: true
35
+ homepage: http://github.com/coderifous/address_extractor
36
+ post_install_message:
37
+ rdoc_options:
38
+ - --line-numbers
39
+ - --inline-source
40
+ - --title
41
+ - Address_extractor
42
+ - --main
43
+ - README.textile
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: "0"
51
+ version:
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "1.2"
57
+ version:
58
+ requirements: []
59
+
60
+ rubyforge_project: address_extractor
61
+ rubygems_version: 1.2.0
62
+ signing_key:
63
+ specification_version: 2
64
+ summary: Give it text. It finds addresses in it.
65
+ test_files:
66
+ - test/test_address_extractor.rb