coderifous-address_extractor 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/LICENSE.textile ADDED
@@ -0,0 +1,23 @@
1
+ h4. Copyright and License
2
+
3
+ The MIT License
4
+
5
+ Copyright (c) 2008 Jim Garvin
6
+
7
+ Permission is hereby granted, free of charge, to any person obtaining a copy
8
+ of this software and associated documentation files (the "Software"), to deal
9
+ in the Software without restriction, including without limitation the rights
10
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11
+ copies of the Software, and to permit persons to whom the Software is
12
+ furnished to do so, subject to the following conditions:
13
+
14
+ The above copyright notice and this permission notice shall be included in
15
+ all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
23
+ THE SOFTWARE.
data/README.textile ADDED
@@ -0,0 +1,45 @@
1
+ h1. AddressExtractor
2
+
3
+ Find and/or replace mailing addresses in strings.
4
+
5
+ h2. Examples
6
+
7
+ <pre><code>
8
+ string = <<EOF
9
+ Please send the package to 123 Foo St., Someplace FL
10
+
11
+ My phone number is 123-1234 and St. Marc of Israel can be reached
12
+ via mail at:
13
+ 123 Goob Avenue
14
+ Apt 123
15
+ Nice Town CA 123456
16
+ EOF
17
+
18
+ # Find first address
19
+ AddressExtractor.first_address(string) # => { :street1 => "123 Foo St.", :city => "Someplace", :state => "FL" }
20
+
21
+ # Find all addresses
22
+ AddressExtractor.find_addresses(string) # =>
23
+ # [
24
+ # { :street1 => "123 Foo St.", :city => "Someplace", :state => "FL" },
25
+ # { :street1 => "123 Goob Avenue", :street2 => "Apt 123", :city => "Nice Town", :state => "CA", :zip => "123456" }
26
+ # ]
27
+
28
+ # Do a gsub on first address
29
+ new_string = AddressExtractor.replace_first_address(string) do |address_hash, address_string|
30
+ map_link_to(address_string)
31
+ end
32
+
33
+ # Do a gsub on all addresses
34
+ new_string = AddressExtractor.replace_addresses(string) do |address_hash, address_string|
35
+ map_link_to(address_string)
36
+ end
37
+ </code></pre>
38
+
39
+ h3. About
40
+
41
+ Written by Jim Garvin at RubyConf '08 at the request of Chris Murphy and Ryan McGeary so they could use it in Yarp.com.
42
+
43
+ You can use it, too.
44
+
45
+ The address-finding regex may be a bit naive; I'll gladly accept pull requests that add to the test data and tests.
data/Rakefile ADDED
@@ -0,0 +1,14 @@
1
+ require 'rubygems' # Rakefile for the address_extractor gem: configures Echoe, then loads extra rake tasks
2
+ require 'rake'
3
+ require 'echoe' # supplies the Echoe.new configuration block used below
4
+
5
+ Echoe.new('address_extractor', '0.1.0') do |p| # arguments: gem name, gem version
6
+ p.description = "Give it text. It finds addresses in it."
7
+ p.url = "http://github.com/coderifous/address_extractor"
8
+ p.author = "Jim Garvin"
9
+ p.email = "jim at thegarvin dot com"
10
+ p.ignore_pattern = ["tmp/*", "script/*"] # presumably globs Echoe leaves out of the package; confirm against the Echoe docs
11
+ p.development_dependencies = [] # no development dependencies are declared
12
+ end
13
+
14
+ Dir["#{File.dirname(__FILE__)}/tasks/*.rake"].sort.each { |ext| load ext } # load every tasks/*.rake file beside this Rakefile, in sorted order
@@ -0,0 +1,30 @@
1
+ Gem::Specification.new do |s| # gem specification for address_extractor 0.1.0
2
+ s.name = %q{address_extractor}
3
+ s.version = "0.1.0"
4
+
5
+ s.required_rubygems_version = Gem::Requirement.new(">= 1.2") if s.respond_to? :required_rubygems_version= # set only when this RubyGems exposes the attribute
6
+ s.authors = ["Jim Garvin"]
7
+ s.date = %q{2008-11-14}
8
+ s.description = %q{Give it text. It finds addresses in it.}
9
+ s.email = %q{jim at thegarvin dot com}
10
+ s.extra_rdoc_files = ["lib/address_extractor.rb", "LICENSE.textile", "README.textile"]
11
+ s.files = ["lib/address_extractor.rb", "LICENSE.textile", "Manifest", "Rakefile", "README.textile", "test/test_address_extractor.rb", "address_extractor.gemspec"]
12
+ s.has_rdoc = true
13
+ s.homepage = %q{http://github.com/coderifous/address_extractor}
14
+ s.rdoc_options = ["--line-numbers", "--inline-source", "--title", "Address_extractor", "--main", "README.textile"]
15
+ s.require_paths = ["lib"]
16
+ s.rubyforge_project = %q{address_extractor}
17
+ s.rubygems_version = %q{1.2.0}
18
+ s.summary = %q{Give it text. It finds addresses in it.}
19
+ s.test_files = ["test/test_address_extractor.rb"]
20
+
21
+ if s.respond_to? :specification_version then # guard for RubyGems versions without this attribute
22
+ current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
23
+ s.specification_version = 2
24
+
25
+ if current_version >= 3 then # NOTE(review): both branches below are empty, so this check has no effect
26
+ else
27
+ end
28
+ else
29
+ end
30
+ end
@@ -0,0 +1,162 @@
1
# Finds, and optionally replaces, US-style mailing addresses embedded in
# free-form text.
#
# An address is reported as a Hash that may contain :street1, :street2, :city,
# :state and :zip; only the fields that actually matched are present. A
# candidate match is kept only when it is "useful": it has a street line plus
# either a ZIP code or both a city and a state (see useful_address?).
class AddressExtractor

  # Field filled by each capture group of ADDRESS_PATTERN, in group order.
  # :zip appears twice because the pattern has two alternative ZIP groups (one
  # following city and state, one standing alone); at most one of them takes
  # part in any given match.
  CAPTURE_MAP = [ :street1, :street2, :city, :state, :zip, :zip ]

  # State and territory names, each followed by its two-letter abbreviation.
  STATES = <<-EOF
    ALABAMA                         AL
    ALASKA                          AK
    AMERICAN SAMOA                  AS
    ARIZONA                         AZ
    ARKANSAS                        AR
    CALIFORNIA                      CA
    COLORADO                        CO
    CONNECTICUT                     CT
    DELAWARE                        DE
    DISTRICT OF COLUMBIA            DC
    FEDERATED STATES OF MICRONESIA  FM
    FLORIDA                         FL
    GEORGIA                         GA
    GUAM                            GU
    HAWAII                          HI
    IDAHO                           ID
    ILLINOIS                        IL
    INDIANA                         IN
    IOWA                            IA
    KANSAS                          KS
    KENTUCKY                        KY
    LOUISIANA                       LA
    MAINE                           ME
    MARSHALL ISLANDS                MH
    MARYLAND                        MD
    MASSACHUSETTS                   MA
    MICHIGAN                        MI
    MINNESOTA                       MN
    MISSISSIPPI                     MS
    MISSOURI                        MO
    MONTANA                         MT
    NEBRASKA                        NE
    NEVADA                          NV
    NEW HAMPSHIRE                   NH
    NEW JERSEY                      NJ
    NEW MEXICO                      NM
    NEW YORK                        NY
    NORTH CAROLINA                  NC
    NORTH DAKOTA                    ND
    NORTHERN MARIANA ISLANDS        MP
    OHIO                            OH
    OKLAHOMA                        OK
    OREGON                          OR
    PALAU                           PW
    PENNSYLVANIA                    PA
    PUERTO RICO                     PR
    RHODE ISLAND                    RI
    SOUTH CAROLINA                  SC
    SOUTH DAKOTA                    SD
    TENNESSEE                       TN
    TEXAS                           TX
    UTAH                            UT
    VERMONT                         VT
    VIRGIN ISLANDS                  VI
    VIRGINIA                        VA
    WASHINGTON                      WA
    WEST VIRGINIA                   WV
    WISCONSIN                       WI
    WYOMING                         WY
  EOF

  # Regex source (a String) matching any state name or abbreviation.
  #
  # ADDRESS_PATTERN is compiled in extended (x) mode, where literal whitespace
  # in the pattern is ignored. The words of a multi-word name are therefore
  # joined with an explicit \s+ so that "NEW YORK" matches "New York" rather
  # than "NewYork".
  #
  # NOTE: the city group in ADDRESS_PATTERN is greedy, so a spelled-out
  # "West Virginia" is still read as a city ending in "West" followed by the
  # state "Virginia".
  STATE_REGEX = STATES.split(/\n/).collect { |line|
    line.scan(/(\w.*\w)\s*([A-Z]{2})\s*$/)
  }.flatten.collect { |term|
    term.split(/\s+/).collect { |word| Regexp.escape(word) }.join('\s+')
  }.join("|")

  # Secondary unit designators (apartment, suite, ...) with their abbreviations.
  SECONDARY_UNIT_DESIGNATORS = <<-EOF
    APARTMENT   APT
    BASEMENT    BSMT
    BUILDING    BLDG
    DEPARTMENT  DEPT
    FLOOR       FL
    FRONT       FRNT
    HANGAR      HNGR
    LOBBY       LBBY
    LOT         LOT
    LOWER       LOWR
    OFFICE      OFC
    PENTHOUSE   PH
    PIER        PIER
    REAR        REAR
    ROOM        RM
    SIDE        SIDE
    SLIP        SLIP
    SPACE       SPC
    STOP        STOP
    SUITE       STE
    TRAILER     TRLR
    UNIT        UNIT
    UPPER       UPPR
  EOF

  # Regex source (a String) matching any designator or its abbreviation.
  SECONDARY_UNIT_DESIGNATORS_REGEX = SECONDARY_UNIT_DESIGNATORS.split(/\n/).collect { |line|
    line.scan(/(\w+)\s*(\w+)\s*$/)
  }.flatten.join("|")

  # Regex source (a String) for a ZIP code. The six-digit form is tried first
  # so input that matched before this form was widened yields the same ZIP;
  # five-digit ZIP and ZIP+4 are accepted as well.
  ZIP_REGEX = '\d{6}|\d{5}(?:-\d{4})?'

  # Capture groups, in order: street1, street2, city, state, zip (after city
  # and state), zip (standing alone). Everything after the street line is
  # optional, so a raw match is only a candidate; callers must filter it with
  # hashify_results or useful_address?.
  ADDRESS_PATTERN = /
    (
      \d+                                 # house number
      \s+
      (?:[A-Za-z'.-]+[^\S\r\n]?){1,3}     # street name, one to three words on a single line
    )
    \s* ,? \s*
    (
      (?:\d+\s+)?                         # a secondary unit, optionally
      \b(?:#{SECONDARY_UNIT_DESIGNATORS_REGEX})\b
      (?:\s+\d+)?
    )?
    \s* ,? \s*                            # a comma, optionally
    (?:
      (?:
        ((?:[A-Za-z]+\s?){1,3})           # city
        \s+
        \b(#{STATE_REGEX})\b              # state
        \s* ,? \s*                        # a comma, optionally
        (#{ZIP_REGEX})?                   # a zip code, optionally
      )
      |                                   # or, instead of city and state
      (#{ZIP_REGEX})?                     # a lone zip code will do
    )
  /xi

  class << self

    # Returns the first useful address found in +string+ as a Hash, or nil
    # when the text contains none. Candidates that are not useful (for
    # example a bare number followed by ordinary words) are skipped.
    def first_address(string)
      string.scan(ADDRESS_PATTERN) do |captures|
        address = hashify_results(captures)
        return address if address
      end
      nil
    end

    # Returns an Array with one Hash per useful address found in +string+,
    # in order of appearance. The Array is empty when there are none.
    def find_addresses(string)
      string.scan(ADDRESS_PATTERN).collect { |captures| hashify_results(captures) }.compact
    end

    # Returns a copy of +string+ in which the first useful address has been
    # replaced by the block's result. The block receives the address Hash and
    # the matched text. The string is returned unchanged (and the block is
    # never called) when no useful address is present.
    def replace_first_address(string)
      replaced = false
      string.gsub(ADDRESS_PATTERN) do |match|
        # Read the captures before calling anything else that might run a
        # regex in this frame.
        address = replaced ? nil : hashify_results(Regexp.last_match.captures)
        if address
          replaced = true
          yield(address, match)
        else
          match
        end
      end
    end

    # Returns a copy of +string+ in which every useful address has been
    # replaced by the block's result. The block receives the address Hash and
    # the matched text. Candidates that are not useful are left as they are.
    def replace_addresses(string)
      string.gsub(ADDRESS_PATTERN) do |match|
        address = hashify_results(Regexp.last_match.captures)
        address ? yield(address, match) : match
      end
    end

    # Converts the capture Array of one ADDRESS_PATTERN match into an address
    # Hash keyed by CAPTURE_MAP, with surrounding whitespace stripped from
    # each field. Returns nil when +matches+ is nil or the resulting address
    # is not useful.
    def hashify_results(matches)
      return nil unless matches
      result = {}
      CAPTURE_MAP.each_with_index do |field, index|
        value = matches[index]
        result[field] = value.to_s.strip if value
      end
      useful_address?(result) ? result : nil
    end

    # True when +hash+ has a street line and either a ZIP code or both a city
    # and a state; false otherwise, including for nil.
    def useful_address?(hash)
      !!( hash && hash[:street1] && ( hash[:zip] || ( hash[:city] && hash[:state] ) ) )
    end

  end
end
@@ -0,0 +1,71 @@
1
# Put the gem's lib directory first on the load path so the tests exercise the
# working copy rather than an installed version.
$:.unshift File.join(File.dirname(__FILE__), "..", "lib")

require 'test/unit'
require 'address_extractor'

# Sample text holding two addresses: one inline, and one spread over three
# indented lines. The test on "via mail at:" expects whitespace between the
# line break and the replaced address, so the address lines must be indented.
DATA1 = <<EOF
Please send the package to 123 Foo St., Someplace FL

My phone number is 123-1234 and St. Marc of Israel can be reached
via mail at:
  123 Goob Avenue
  Apt 123
  Nice Town CA 123456
EOF

# Exercises the four public entry points of AddressExtractor against DATA1.
class AddressExtractorTest < Test::Unit::TestCase

  # Shared assertions describing the two addresses contained in DATA1.
  module Helpers
    def assert_first_address(a)
      assert_not_nil a
      assert_equal "123 Foo St.", a[:street1]
      assert_nil a[:street2]
      assert_equal "Someplace", a[:city]
      assert_equal "FL", a[:state]
      assert_nil a[:zip]
    end

    # The matched text may carry the whitespace that follows the address.
    def assert_first_address_string(string)
      assert_match(/^123 Foo St\., Someplace FL\s*$/, string)
    end

    def assert_second_address(a)
      assert_not_nil a
      assert_equal "123 Goob Avenue", a[:street1]
      assert_equal "Apt 123", a[:street2]
      assert_equal "Nice Town", a[:city]
      assert_equal "CA", a[:state]
      assert_equal "123456", a[:zip]
    end
  end
  include Helpers

  def test_first_address_extraction
    assert_first_address AddressExtractor.first_address(DATA1)
  end

  def test_find_addresses
    addresses = AddressExtractor.find_addresses(DATA1)
    assert_first_address addresses[0]
    assert_second_address addresses[1]
  end

  def test_replace_first_address
    string = AddressExtractor.replace_first_address(DATA1) do |address_hash, address|
      assert_first_address address_hash
      assert_first_address_string address
      "skidoosh"
    end
    assert_match(/Please send the package to skidoosh/, string)
    # Only the first address may be touched.
    assert_match(/123 Goob Avenue/, string)
  end

  def test_replace_addresses
    string = AddressExtractor.replace_addresses(DATA1) do |address_hash, address|
      "skidoosh"
    end
    assert_match(/Please send the package to skidoosh/, string)
    assert_match(/via mail at:\n\s*skidoosh/, string)
  end
end
metadata ADDED
@@ -0,0 +1,66 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: coderifous-address_extractor
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.1.0
5
+ platform: ruby
6
+ authors:
7
+ - Jim Garvin
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+
12
+ date: 2008-11-14 00:00:00 -08:00
13
+ default_executable:
14
+ dependencies: []
15
+
16
+ description: Give it text. It finds addresses in it.
17
+ email: jim at thegarvin dot com
18
+ executables: []
19
+
20
+ extensions: []
21
+
22
+ extra_rdoc_files:
23
+ - lib/address_extractor.rb
24
+ - LICENSE.textile
25
+ - README.textile
26
+ files:
27
+ - lib/address_extractor.rb
28
+ - LICENSE.textile
29
+ - Manifest
30
+ - Rakefile
31
+ - README.textile
32
+ - test/test_address_extractor.rb
33
+ - address_extractor.gemspec
34
+ has_rdoc: true
35
+ homepage: http://github.com/coderifous/address_extractor
36
+ post_install_message:
37
+ rdoc_options:
38
+ - --line-numbers
39
+ - --inline-source
40
+ - --title
41
+ - Address_extractor
42
+ - --main
43
+ - README.textile
44
+ require_paths:
45
+ - lib
46
+ required_ruby_version: !ruby/object:Gem::Requirement
47
+ requirements:
48
+ - - ">="
49
+ - !ruby/object:Gem::Version
50
+ version: "0"
51
+ version:
52
+ required_rubygems_version: !ruby/object:Gem::Requirement
53
+ requirements:
54
+ - - ">="
55
+ - !ruby/object:Gem::Version
56
+ version: "1.2"
57
+ version:
58
+ requirements: []
59
+
60
+ rubyforge_project: address_extractor
61
+ rubygems_version: 1.2.0
62
+ signing_key:
63
+ specification_version: 2
64
+ summary: Give it text. It finds addresses in it.
65
+ test_files:
66
+ - test/test_address_extractor.rb