coderifous-address_extractor 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/LICENSE.textile +23 -0
- data/README.textile +45 -0
- data/Rakefile +14 -0
- data/address_extractor.gemspec +30 -0
- data/lib/address_extractor.rb +162 -0
- data/test/test_address_extractor.rb +71 -0
- metadata +66 -0
data/LICENSE.textile
ADDED
@@ -0,0 +1,23 @@
|
|
1
|
+
h4. Copyright and License
|
2
|
+
|
3
|
+
The MIT License
|
4
|
+
|
5
|
+
Copyright (c) 2008 Jim Garvin
|
6
|
+
|
7
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
8
|
+
of this software and associated documentation files (the "Software"), to deal
|
9
|
+
in the Software without restriction, including without limitation the rights
|
10
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
11
|
+
copies of the Software, and to permit persons to whom the Software is
|
12
|
+
furnished to do so, subject to the following conditions:
|
13
|
+
|
14
|
+
The above copyright notice and this permission notice shall be included in
|
15
|
+
all copies or substantial portions of the Software.
|
16
|
+
|
17
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
18
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
19
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
20
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
21
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
22
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
23
|
+
THE SOFTWARE.
|
data/README.textile
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
h1. AddressExtractor
|
2
|
+
|
3
|
+
Find and/or replace mailing addresses in strings.
|
4
|
+
|
5
|
+
h2. Examples
|
6
|
+
|
7
|
+
<pre><code>
|
8
|
+
string = <<EOF
|
9
|
+
Please send the package to 123 Foo St., Someplace FL
|
10
|
+
|
11
|
+
My phone number is 123-1234 and St. Marc of Israel can be reached
|
12
|
+
via mail at:
|
13
|
+
123 Goob Avenue
|
14
|
+
Apt 123
|
15
|
+
Nice Town CA 123456
|
16
|
+
EOF
|
17
|
+
|
18
|
+
# Find first address
|
19
|
+
AddressExtractor.first_address(string) # => { :street1 => "123 Foo St.", :city => "Someplace", :state => "FL" }
|
20
|
+
|
21
|
+
# Find all addresses
|
22
|
+
AddressExtractor.find_addresses(string) # =>
|
23
|
+
# [
|
24
|
+
# { :street1 => "123 Foo St.", :city => "Someplace", :state => "FL" },
|
25
|
+
# { :street1 => "123 Goob Avenue", :street2 => "Apt 123", :city => "Nice Town", :state => "CA", :zip => "123456" }
|
26
|
+
# ]
|
27
|
+
|
28
|
+
# Do a gsub on first address
|
29
|
+
new_string = AddressExtractor.replace_first_address(string) do |address_hash, address_string|
|
30
|
+
map_link_to(address_string)
|
31
|
+
end
|
32
|
+
|
33
|
+
# Do a gsub on all addresses
|
34
|
+
new_string = AddressExtractor.replace_addresses(string) do |address_hash, address_string|
|
35
|
+
map_link_to(address_string)
|
36
|
+
end
|
37
|
+
</code></pre>
|
38
|
+
|
39
|
+
h3. About
|
40
|
+
|
41
|
+
Written by Jim Garvin at RubyConf '08 at the request of Chris Murphy and Ryan McGeary so they could use it in Yarp.com.
|
42
|
+
|
43
|
+
You can use it, too.
|
44
|
+
|
45
|
+
The address-finding regex may be a bit naive; I'll gladly accept pull requests that add to the test data and tests.
|
data/Rakefile
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
require 'rubygems'
|
2
|
+
require 'rake'
|
3
|
+
require 'echoe'
|
4
|
+
|
5
|
+
# Packaging configuration: Echoe receives the gem name, version and the
# descriptive settings assigned in the block below.
Echoe.new('address_extractor', '0.1.0') do |gem|
  gem.description              = "Give it text. It finds addresses in it."
  gem.url                      = "http://github.com/coderifous/address_extractor"
  gem.author                   = "Jim Garvin"
  gem.email                    = "jim at thegarvin dot com"
  gem.ignore_pattern           = %w[tmp/* script/*]
  gem.development_dependencies = []
end

# Load any extra rake task files kept in tasks/ next to this Rakefile,
# in sorted (deterministic) order.
task_files = Dir.glob(File.join(File.dirname(__FILE__), 'tasks', '*.rake'))
task_files.sort.each do |task_file|
  load task_file
end
|
@@ -0,0 +1,30 @@
|
|
1
|
+
# Gem specification for address_extractor 0.1.0.
#
# Setters that are not present in every RubyGems release are guarded with
# respond_to? so the file keeps loading across RubyGems versions.
Gem::Specification.new do |s|
  s.name = %q{address_extractor}
  s.version = "0.1.0"

  s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version=
  s.authors = ["Jim Garvin"]
  s.date = %q{2008-11-14}
  s.description = %q{Give it text. It finds addresses in it.}
  s.email = %q{jim at thegarvin dot com}
  s.extra_rdoc_files = ["lib/address_extractor.rb", "LICENSE.textile", "README.textile"]
  s.files = ["lib/address_extractor.rb", "LICENSE.textile", "Manifest", "Rakefile", "README.textile", "test/test_address_extractor.rb", "address_extractor.gemspec"]
  # NOTE(review): has_rdoc= is deprecated in later RubyGems releases and may be
  # missing entirely; only call it where it is still defined.
  s.has_rdoc = true if s.respond_to? :has_rdoc=
  s.homepage = %q{http://github.com/coderifous/address_extractor}
  s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Address_extractor", "--main", "README.textile"]
  s.require_paths = ["lib"]
  # NOTE(review): rubyforge_project= is deprecated in later RubyGems releases
  # as well; same guard as above.
  s.rubyforge_project = %q{address_extractor} if s.respond_to? :rubyforge_project=
  s.rubygems_version = %q{1.2.0}
  s.summary = %q{Give it text. It finds addresses in it.}
  s.test_files = ["test/test_address_extractor.rb"]

  # The gem declares no dependencies, so the generated version-specific
  # dependency branches were empty and have been dropped.
  s.specification_version = 2 if s.respond_to? :specification_version
end
|
@@ -0,0 +1,162 @@
|
|
1
|
+
# Finds and/or replaces US-style mailing addresses embedded in free text.
#
# Everything is exposed as class methods. An address is reported as a Hash
# that may contain the keys :street1, :street2, :city, :state and :zip; keys
# for parts that were not found are simply absent. A match only counts as an
# address when it is "useful": it has a street line plus either a ZIP code or
# both a city and a state.
class AddressExtractor

  # Field name for each capture group of ADDRESS_PATTERN, in group order.
  # :zip appears twice because the pattern has two mutually exclusive ZIP
  # groups (one after city and state, one for a lone ZIP code).
  CAPTURE_MAP = [ :street1, :street2, :city, :state, :zip, :zip ].freeze

  # State and territory names with their two-letter abbreviations, one per line.
  STATES = <<-EOF
    ALABAMA                         AL
    ALASKA                          AK
    AMERICAN SAMOA                  AS
    ARIZONA                         AZ
    ARKANSAS                        AR
    CALIFORNIA                      CA
    COLORADO                        CO
    CONNECTICUT                     CT
    DELAWARE                        DE
    DISTRICT OF COLUMBIA            DC
    FEDERATED STATES OF MICRONESIA  FM
    FLORIDA                         FL
    GEORGIA                         GA
    GUAM                            GU
    HAWAII                          HI
    IDAHO                           ID
    ILLINOIS                        IL
    INDIANA                         IN
    IOWA                            IA
    KANSAS                          KS
    KENTUCKY                        KY
    LOUISIANA                       LA
    MAINE                           ME
    MARSHALL ISLANDS                MH
    MARYLAND                        MD
    MASSACHUSETTS                   MA
    MICHIGAN                        MI
    MINNESOTA                       MN
    MISSISSIPPI                     MS
    MISSOURI                        MO
    MONTANA                         MT
    NEBRASKA                        NE
    NEVADA                          NV
    NEW HAMPSHIRE                   NH
    NEW JERSEY                      NJ
    NEW MEXICO                      NM
    NEW YORK                        NY
    NORTH CAROLINA                  NC
    NORTH DAKOTA                    ND
    NORTHERN MARIANA ISLANDS        MP
    OHIO                            OH
    OKLAHOMA                        OK
    OREGON                          OR
    PALAU                           PW
    PENNSYLVANIA                    PA
    PUERTO RICO                     PR
    RHODE ISLAND                    RI
    SOUTH CAROLINA                  SC
    SOUTH DAKOTA                    SD
    TENNESSEE                       TN
    TEXAS                           TX
    UTAH                            UT
    VERMONT                         VT
    VIRGIN ISLANDS                  VI
    VIRGINIA                        VA
    WASHINGTON                      WA
    WEST VIRGINIA                   WV
    WISCONSIN                       WI
    WYOMING                         WY
  EOF

  # Regexp source (a String) alternating over every state name and
  # abbreviation. ADDRESS_PATTERN is compiled in extended (x) mode, where a
  # literal space in the pattern is ignored, so the words of multi-word names
  # are joined with an explicit \s+ instead of a space.
  STATE_REGEX = STATES.split(/\n/)
    .map { |row| row.scan(/(\w.*\w)\s*([A-Z]{2})\s*$/) }
    .flatten
    .map { |term| term.split.join('\s+') }
    .join("|")
    .freeze

  # Secondary unit designators with their abbreviations, one pair per line.
  SECONDARY_UNIT_DESIGNATORS = <<-EOF
    APARTMENT   APT
    BASEMENT    BSMT
    BUILDING    BLDG
    DEPARTMENT  DEPT
    FLOOR       FL
    FRONT       FRNT
    HANGAR      HNGR
    LOBBY       LBBY
    LOT         LOT
    LOWER       LOWR
    OFFICE      OFC
    PENTHOUSE   PH
    PIER        PIER
    REAR        REAR
    ROOM        RM
    SIDE        SIDE
    SLIP        SLIP
    SPACE       SPC
    STOP        STOP
    SUITE       STE
    TRAILER     TRLR
    UNIT        UNIT
    UPPER       UPPR
  EOF

  # Regexp source (a String) alternating over every designator and abbreviation.
  SECONDARY_UNIT_DESIGNATORS_REGEX = SECONDARY_UNIT_DESIGNATORS.split(/\n/)
    .map { |row| row.scan(/(\w+)\s*(\w+)\s*$/) }
    .flatten
    .join("|")
    .freeze

  # Matches one address candidate. Capture groups, in order (see CAPTURE_MAP):
  # street line, secondary unit, city, state, ZIP after city and state, lone ZIP.
  #
  # Street and city words may only be separated by blanks (space or tab), so a
  # component never runs on into the following line. Unit designators are
  # wrapped in word boundaries so that "FL" does not match the start of a
  # city such as "Flagstaff". A ZIP is 5 or 6 digits with an optional
  # four-digit extension; 6 digits are kept for backward compatibility.
  ADDRESS_PATTERN = /
    (
      \d+                                  # house number
      [[:blank:]]+
      (?:[A-Za-z'.-]+[[:blank:]]?){1,3}    # followed by a street name on the same line
    )
    \s* ,? \s*
    (
      (?:\d+\s+)?                          # a secondary unit, optionally
      \b(?:#{SECONDARY_UNIT_DESIGNATORS_REGEX})\b
      (?:\s+\d+)?
    )?
    \s* ,? \s*                             # a comma, optionally
    (?:
      (?:
        ((?:[A-Za-z]+[[:blank:]]?){1,3})   # city
        \s+
        \b(#{STATE_REGEX})\b               # state
        \s* ,? \s*                         # a comma, optionally
        (\d{5,6}(?:-\d{4})?)?              # a zip code, optionally
      )
      |                                    # or, instead of city and state
      (\d{5,6}(?:-\d{4})?)?                # a lone zip code will do
    )
  /xi

  class << self

    # Returns the Hash for the first useful address in +string+, or nil when
    # there is none. Pattern hits that do not amount to a useful address are
    # skipped rather than ending the search.
    def first_address(string)
      string.scan(ADDRESS_PATTERN) do |captures|
        address = hashify_results(captures)
        return address if address
      end
      nil
    end

    # Returns an Array with one Hash per useful address found in +string+,
    # in order of appearance (empty when nothing is found).
    def find_addresses(string)
      string.scan(ADDRESS_PATTERN).map { |captures| hashify_results(captures) }.compact
    end

    # Returns a copy of +string+ in which the first useful address is replaced
    # by the value of the block. The block receives the address Hash and the
    # matched text. The block is not called when no useful address exists.
    def replace_first_address(string)
      replaced = false
      string.gsub(ADDRESS_PATTERN) do |match|
        # Everything after the first replacement is passed through unchanged.
        next match if replaced

        address = hashify_results(Regexp.last_match.captures)
        next match unless address

        replaced = true
        yield(address, match)
      end
    end

    # Returns a copy of +string+ in which every useful address is replaced by
    # the value of the block. The block receives the address Hash and the
    # matched text. Pattern hits that are not useful addresses are left as is.
    def replace_addresses(string)
      string.gsub(ADDRESS_PATTERN) do |match|
        address = hashify_results(Regexp.last_match.captures)
        address ? yield(address, match) : match
      end
    end

    # Converts the capture groups of one ADDRESS_PATTERN match into an address
    # Hash keyed by CAPTURE_MAP, with surrounding whitespace stripped from
    # each value. Returns nil when +matches+ is nil or the result is not a
    # useful address.
    def hashify_results(matches)
      return nil unless matches

      result = { }
      CAPTURE_MAP.each_with_index do |field, index|
        value = matches[index]
        result[field] = value.to_s.strip if value
      end
      useful_address?(result) ? result : nil
    end

    # True when +hash+ has a street line and either a ZIP code or both a city
    # and a state; false otherwise (including for nil).
    def useful_address?(hash)
      return false unless hash && hash[:street1]
      return true if hash[:zip]

      hash[:city] && hash[:state] ? true : false
    end

  end

end
|
@@ -0,0 +1,71 @@
|
|
1
|
+
$: << File.dirname(__FILE__)+"/../lib"
|
2
|
+
|
3
|
+
require 'test/unit'
|
4
|
+
require 'address_extractor.rb'
|
5
|
+
|
6
|
+
# Checks AddressExtractor against the two addresses embedded in DATA1.
class AddressExtractorTest < Test::Unit::TestCase

  def test_first_address_extraction
    address = AddressExtractor.first_address(DATA1)
    assert_first_address(address)
  end

  def test_find_addresses
    addresses = AddressExtractor.find_addresses(DATA1)
    assert_first_address addresses[0]
    assert_second_address addresses[1]
  end

  def test_replace_first_address
    string = AddressExtractor.replace_first_address(DATA1) do |address_hash, address|
      assert_first_address address_hash
      assert_first_address_string address
      "skidoosh"
    end
    assert_match(/Please send the package to skidoosh/, string)
  end

  def test_replace_addresses
    string = AddressExtractor.replace_addresses(DATA1) do |_address_hash, _address|
      "skidoosh"
    end
    assert_match(/Please send the package to skidoosh/, string)
    # \s* keeps the check independent of how the address lines in DATA1 are indented.
    assert_match(/via mail at:\n\s*skidoosh/, string)
  end

  # Shared assertions describing the two addresses contained in DATA1.
  module Helpers
    # Expects the "123 Foo St." address: no secondary unit and no ZIP.
    def assert_first_address(a)
      assert_not_nil a
      assert_equal "123 Foo St.", a[:street1]
      assert_nil a[:street2]
      assert_equal "Someplace", a[:city]
      assert_equal "FL", a[:state]
      assert_nil a[:zip]
    end

    # Expects the matched text of the first address; trailing whitespace is allowed.
    def assert_first_address_string(string)
      assert_match(/^123 Foo St\., Someplace FL\s*$/, string)
    end

    # Expects the multi-line "123 Goob Avenue" address with all five fields.
    def assert_second_address(a)
      assert_not_nil a
      assert_equal "123 Goob Avenue", a[:street1]
      assert_equal "Apt 123", a[:street2]
      assert_equal "Nice Town", a[:city]
      assert_equal "CA", a[:state]
      assert_equal "123456", a[:zip]
    end
  end
  include Helpers
end

# Sample text: a single-line address, a sentence containing digits that is
# not an address, and an address spread over three lines.
DATA1 = <<EOF
Please send the package to 123 Foo St., Someplace FL

My phone number is 123-1234 and St. Marc of Israel can be reached
via mail at:
123 Goob Avenue
Apt 123
Nice Town CA 123456
EOF
|
metadata
ADDED
@@ -0,0 +1,66 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: coderifous-address_extractor
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.1.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jim Garvin
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
|
12
|
+
date: 2008-11-14 00:00:00 -08:00
|
13
|
+
default_executable:
|
14
|
+
dependencies: []
|
15
|
+
|
16
|
+
description: Give it text. It finds addresses in it.
|
17
|
+
email: jim at thegarvin dot com
|
18
|
+
executables: []
|
19
|
+
|
20
|
+
extensions: []
|
21
|
+
|
22
|
+
extra_rdoc_files:
|
23
|
+
- lib/address_extractor.rb
|
24
|
+
- LICENSE.textile
|
25
|
+
- README.textile
|
26
|
+
files:
|
27
|
+
- lib/address_extractor.rb
|
28
|
+
- LICENSE.textile
|
29
|
+
- Manifest
|
30
|
+
- Rakefile
|
31
|
+
- README.textile
|
32
|
+
- test/test_address_extractor.rb
|
33
|
+
- address_extractor.gemspec
|
34
|
+
has_rdoc: true
|
35
|
+
homepage: http://github.com/coderifous/address_extractor
|
36
|
+
post_install_message:
|
37
|
+
rdoc_options:
|
38
|
+
- --line-numbers
|
39
|
+
- --inline-source
|
40
|
+
- --title
|
41
|
+
- Address_extractor
|
42
|
+
- --main
|
43
|
+
- README.textile
|
44
|
+
require_paths:
|
45
|
+
- lib
|
46
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
47
|
+
requirements:
|
48
|
+
- - ">="
|
49
|
+
- !ruby/object:Gem::Version
|
50
|
+
version: "0"
|
51
|
+
version:
|
52
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
53
|
+
requirements:
|
54
|
+
- - ">="
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: "1.2"
|
57
|
+
version:
|
58
|
+
requirements: []
|
59
|
+
|
60
|
+
rubyforge_project: address_extractor
|
61
|
+
rubygems_version: 1.2.0
|
62
|
+
signing_key:
|
63
|
+
specification_version: 2
|
64
|
+
summary: Give it text. It finds addresses in it.
|
65
|
+
test_files:
|
66
|
+
- test/test_address_extractor.rb
|