textutils 0.8.3 → 0.8.4
Sign up to get free protection for your applications and to get access to all the features.
- data/Manifest.txt +1 -0
- data/lib/textutils/helper/address_helper.rb +133 -21
- data/lib/textutils/helper/title_helper.rb +4 -2
- data/lib/textutils/version.rb +1 -1
- data/test/test_address_helper.rb +108 -0
- data/test/test_title_helper.rb +3 -1
- metadata +10 -8
data/Manifest.txt
CHANGED
@@ -4,21 +4,34 @@
|
|
4
4
|
module TextUtils
|
5
5
|
module AddressHelper
|
6
6
|
|
7
|
-
def normalize_addr( old_address )
|
8
|
-
|
7
|
+
def normalize_addr( old_address, country_key=nil )
|
8
|
+
|
9
|
+
# for now only checks german (de) 5-digit zip code and
|
10
|
+
# austrian (at) 4-digit zip code
|
9
11
|
#
|
10
12
|
# e.g. Alte Plauener Straße 24 // 95028 Hof becomes
|
11
13
|
# 95028 Hof // Alte Plauener Straße 24
|
12
14
|
|
15
|
+
if country_key.nil?
|
16
|
+
puts "TextUtils.normalize_addr drepreciated call - country_key now required; please add !!"
|
17
|
+
return old_address
|
18
|
+
end
|
19
|
+
|
13
20
|
new_address = old_address # default - do nothing - just pass through
|
14
|
-
|
21
|
+
|
15
22
|
lines = old_address.split( '//' )
|
16
|
-
|
23
|
+
|
17
24
|
if lines.size == 2 # two lines / check for switching lines
|
25
|
+
|
18
26
|
line1 = lines[0].strip
|
19
27
|
line2 = lines[1].strip
|
20
|
-
|
21
|
-
|
28
|
+
|
29
|
+
regex_nnnn = /^[0-9]{4}\s+/ # four digits postal code
|
30
|
+
regex_nnnnn = /^[0-9]{5}\s+/ # five digits postal code
|
31
|
+
|
32
|
+
if (country_key == 'at' && line2 =~ regex_nnnn ) ||
|
33
|
+
(country_key == 'de' && line2 =~ regex_nnnnn )
|
34
|
+
new_address = "#{line2} // #{line1}"
|
22
35
|
end
|
23
36
|
end
|
24
37
|
|
@@ -26,28 +39,127 @@ module TextUtils
|
|
26
39
|
end
|
27
40
|
|
28
41
|
|
29
|
-
|
30
|
-
# -- make country_key optional - why? why not?
|
31
|
-
# n move to second pos; use opts={} why? why not?
|
42
|
+
def find_city_in_addr_without_postal_code( address )
|
32
43
|
|
33
|
-
|
44
|
+
## general rule; not country-specific; no postal code/zip code or state
|
45
|
+
# - must be like two lines (one line empty) e.g.
|
46
|
+
# // London or
|
47
|
+
# London //
|
48
|
+
# will assume entry is city
|
49
|
+
# note: city may NOT include numbers, or pipe (|) or comma (,) chars
|
50
|
+
|
51
|
+
# fix: use blank?
|
52
|
+
return nil if address.nil? || address.empty? # do NOT process nil or empty address lines; sorry
|
34
53
|
|
35
|
-
|
54
|
+
old_lines = address.split( '//' )
|
55
|
+
|
56
|
+
###
|
57
|
+
# note: London // will get split into array with size 1 e.g. ['London ']
|
58
|
+
# support it, that is, add missing empty line
|
59
|
+
|
60
|
+
# 1) strip lines
|
61
|
+
# 2) remove blank lines
|
62
|
+
lines = []
|
63
|
+
|
64
|
+
old_lines.each do |line|
|
65
|
+
linec = line.strip
|
66
|
+
next if linec.empty?
|
67
|
+
lines << linec
|
68
|
+
end
|
69
|
+
|
70
|
+
if lines.size == 1
|
71
|
+
linec = lines[0]
|
72
|
+
# note: city may NOT include
|
73
|
+
# numbers (e.g. assumes zip/postal code etc.) or
|
74
|
+
# pipe (|) or
|
75
|
+
# comma (,)
|
76
|
+
if linec =~ /[0-9|,]/
|
77
|
+
return nil
|
78
|
+
end
|
79
|
+
# two or more uppercase letters in a row e.g. TX NY etc.
|
80
|
+
# check if city exists with two uppercase letters??
|
81
|
+
if linec =~ /[A-Z]{2,}/
|
82
|
+
return nil
|
83
|
+
end
|
84
|
+
return linec # bingo!!! assume candidate line is a city name
|
85
|
+
end
|
86
|
+
|
87
|
+
nil # no generic city match found
|
88
|
+
end
|
89
|
+
|
90
|
+
|
91
|
+
def find_city_in_addr_with_postal_code( address, country_key )
|
92
|
+
|
93
|
+
# fix: use blank?
|
94
|
+
return nil if address.nil? || address.empty? # do NOT process nil or empty address lines; sorry
|
36
95
|
|
37
96
|
lines = address.split( '//' )
|
38
97
|
|
39
|
-
if country_key == 'at' || country_key == '
|
40
|
-
#
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
47
|
-
|
98
|
+
if country_key == 'at' || country_key == 'be'
|
99
|
+
# support for now
|
100
|
+
# - 2018 Antwerpen or 2870 Breendonk-Puurs (be)
|
101
|
+
lines.each do |line|
|
102
|
+
linec = line.strip
|
103
|
+
regex_nnnn = /^[0-9]{4}\s+/
|
104
|
+
if linec =~ regex_nnnn # must start w/ four digit postal code ? assume its the city line
|
105
|
+
return linec.sub( regex_nnnn, '' ) # cut off leading postal code; assume rest is city
|
106
|
+
end
|
107
|
+
end
|
108
|
+
elsif country_key == 'de'
|
109
|
+
lines.each do |line|
|
110
|
+
linec = line.strip
|
111
|
+
regex_nnnnn = /^[0-9]{5}\s+/
|
112
|
+
if linec =~ regex_nnnnn # must start w/ five digit postal code ? assume its the city line
|
113
|
+
return linec.sub( regex_nnnnn, '' ) # cut off leading postal code; assume rest is city
|
114
|
+
end
|
115
|
+
end
|
116
|
+
elsif country_key == 'cz' || country_key == 'sk'
|
117
|
+
# support for now
|
118
|
+
# - 284 15 Kutná Hora or 288 25 Nymburk (cz)
|
119
|
+
# - 036 42 Martin or 974 05 Banská Bystrica (sk)
|
120
|
+
lines.each do |line|
|
121
|
+
linec = line.strip
|
122
|
+
regex_nnn_nn = /^[0-9]{3}\s[0-9]{2}\s+/
|
123
|
+
if linec =~ regex_nnn_nn # must start w/ five digit postal code ? assume its the city line
|
124
|
+
return linec.sub( regex_nnn_nn, '' ) # cut off leading postal code; assume rest is city
|
125
|
+
end
|
126
|
+
end
|
127
|
+
elsif country_key == 'us'
|
128
|
+
# support for now
|
129
|
+
# - Brooklyn | NY 11249 or Brooklyn, NY 11249
|
130
|
+
# - Brooklyn | NY or Brooklyn, NY
|
131
|
+
|
132
|
+
lines.each do |line|
|
133
|
+
linec = line.strip
|
134
|
+
regexes_us = [/\s*[|,]\s+[A-Z]{2}\s+[0-9]{5}\s*$/,
|
135
|
+
/\s*[|,]\s+[A-Z]{2}\s*$/]
|
136
|
+
|
137
|
+
regexes_us.each do |regex|
|
138
|
+
if linec =~ regex
|
139
|
+
return linec.sub( regex, '' ) # cut off trailing state (and zip code, if present); assume rest is city
|
140
|
+
end
|
141
|
+
end
|
142
|
+
end
|
48
143
|
else
|
49
|
-
|
144
|
+
# unsupported country/address schema for now; sorry
|
50
145
|
end
|
146
|
+
return nil # sorry nothing found
|
147
|
+
end
|
148
|
+
|
149
|
+
|
150
|
+
def find_city_in_addr( address, country_key )
|
151
|
+
|
152
|
+
# fix: use blank?
|
153
|
+
return nil if address.nil? || address.empty? # do NOT process nil or empty address lines; sorry
|
154
|
+
|
155
|
+
## try generic rule first (e.g. w/o postal code/zip code or state), see above
|
156
|
+
city = find_city_in_addr_without_postal_code( address )
|
157
|
+
return city unless city.nil?
|
158
|
+
|
159
|
+
city = find_city_in_addr_with_postal_code( address, country_key )
|
160
|
+
return city unless city.nil?
|
161
|
+
|
162
|
+
nil # sorry; no city found (using known patterns)
|
51
163
|
end
|
52
164
|
|
53
165
|
|
@@ -49,10 +49,11 @@ module TextUtils
|
|
49
49
|
end
|
50
50
|
|
51
51
|
def strip_special_chars( title )
|
52
|
-
# remove special chars (e.g.
|
52
|
+
# remove special chars (e.g. %°&$)
|
53
53
|
# e.g. +Malta
|
54
54
|
# Minerva 8:60
|
55
|
-
|
55
|
+
# $Alianz$ Arena
|
56
|
+
title.gsub( /[%&°+:$]/, '' )
|
56
57
|
end
|
57
58
|
|
58
59
|
def title_to_key( title )
|
@@ -112,6 +113,7 @@ module TextUtils
|
|
112
113
|
['ő', 'o' ], # e.g. Győri
|
113
114
|
['ó', 'o' ], # e.g. Colón, Łódź, Kraków
|
114
115
|
['õ', 'o' ], # e.g. Nõmme
|
116
|
+
['ô', 'o' ], # e.g. Amazônia (pt)
|
115
117
|
['ø', 'o' ], # e.g. Fuglafjørdur, København
|
116
118
|
['ř', 'r' ], # e.g. Třeboň
|
117
119
|
['ș', 's' ], # e.g. Chișinău, București
|
data/lib/textutils/version.rb
CHANGED
@@ -0,0 +1,108 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
class TestAddressHelper < MiniTest::Unit::TestCase
|
7
|
+
|
8
|
+
def test_normalize_addr
|
9
|
+
|
10
|
+
txt_io = [
|
11
|
+
['Alte Plauener Straße 24 // 95028 Hof', nil, 'Alte Plauener Straße 24 // 95028 Hof'],
|
12
|
+
['Alte Plauener Straße 24 // 95028 Hof', 'de', '95028 Hof // Alte Plauener Straße 24'],
|
13
|
+
['Mautner Markhof-Straße 11 // 2320 Schwechat', nil, 'Mautner Markhof-Straße 11 // 2320 Schwechat'],
|
14
|
+
['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', '2320 Schwechat // Mautner Markhof-Straße 11']
|
15
|
+
]
|
16
|
+
|
17
|
+
txt_io.each_with_index do |txt,i|
|
18
|
+
puts "testing [#{i}] #{txt[0]}"
|
19
|
+
assert_equal txt[2], TextUtils.normalize_addr( txt[0], txt[1] )
|
20
|
+
end
|
21
|
+
|
22
|
+
end # method test_normalize_addr
|
23
|
+
|
24
|
+
|
25
|
+
def test_addr_without_postal_code # aka generic rule
|
26
|
+
|
27
|
+
txt_io = [
|
28
|
+
['London //', 'London'],
|
29
|
+
['// London', 'London'],
|
30
|
+
['// London ', 'London'],
|
31
|
+
[' // London', 'London'],
|
32
|
+
['// London, W4 2QB', nil],
|
33
|
+
['// London | W4 2QB', nil],
|
34
|
+
['// London W4 2QB', nil],
|
35
|
+
['Chiswick Lane South // London, W4 2QB', nil],
|
36
|
+
['The Griffin Brewery // Chiswick Lane South // London', nil], # three lines will NOT work, sorry
|
37
|
+
['// New York, NY', nil],
|
38
|
+
['// New York NY', nil] # check: does it exist in the real world (e.g. w/o comma or pipe?) support it?
|
39
|
+
]
|
40
|
+
|
41
|
+
txt_io.each_with_index do |txt,i|
|
42
|
+
puts "testing [#{i}] #{txt[0]}"
|
43
|
+
assert_equal txt[1], TextUtils.find_city_in_addr_without_postal_code( txt[0] )
|
44
|
+
end
|
45
|
+
end # method test_addr_without_postal_code
|
46
|
+
|
47
|
+
|
48
|
+
def test_addr_with_postal_code
|
49
|
+
|
50
|
+
txt_io = [
|
51
|
+
['2320 Schwechat // Mautner Markhof-Straße 11', 'at', 'Schwechat'],
|
52
|
+
['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', 'Schwechat'],
|
53
|
+
['3910 Zwettl // Syrnauer Straße 22-25', 'at', 'Zwettl'],
|
54
|
+
['Syrnauer Straße 22-25 // 3910 Zwettl', 'at', 'Zwettl'],
|
55
|
+
['2018 Antwerpen', 'be', 'Antwerpen'],
|
56
|
+
['2870 Breendonk-Puurs', 'be', 'Breendonk-Puurs'],
|
57
|
+
['Alte Plauener Straße 24 // 95028 Hof', 'de', 'Hof'],
|
58
|
+
['95028 Hof // Alte Plauener Straße 24', 'de', 'Hof'],
|
59
|
+
['284 15 Kutná Hora', 'cz', 'Kutná Hora'],
|
60
|
+
['288 25 Nymburk', 'cz', 'Nymburk'],
|
61
|
+
['036 42 Martin', 'sk', 'Martin'],
|
62
|
+
['974 05 Banská Bystrica', 'sk', 'Banská Bystrica'],
|
63
|
+
['Brooklyn | NY 11249', 'us', 'Brooklyn'],
|
64
|
+
['Brooklyn, NY 11249', 'us', 'Brooklyn'],
|
65
|
+
['Brooklyn | NY', 'us', 'Brooklyn'],
|
66
|
+
['Brooklyn, NY', 'us', 'Brooklyn'],
|
67
|
+
]
|
68
|
+
|
69
|
+
txt_io.each_with_index do |txt,i|
|
70
|
+
puts "testing [#{i}] #{txt[0]}"
|
71
|
+
assert_equal txt[2], TextUtils.find_city_in_addr_with_postal_code( txt[0], txt[1] )
|
72
|
+
end
|
73
|
+
end # method test_addr_with_postal_code
|
74
|
+
|
75
|
+
|
76
|
+
def test_addr
|
77
|
+
|
78
|
+
txt_io = [
|
79
|
+
['London //', nil, 'London'],
|
80
|
+
['// London', nil, 'London'],
|
81
|
+
['// London ', nil, 'London'],
|
82
|
+
[' // London', nil, 'London'],
|
83
|
+
['2320 Schwechat // Mautner Markhof-Straße 11', 'at', 'Schwechat'],
|
84
|
+
['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', 'Schwechat'],
|
85
|
+
['3910 Zwettl // Syrnauer Straße 22-25', 'at', 'Zwettl'],
|
86
|
+
['Syrnauer Straße 22-25 // 3910 Zwettl', 'at', 'Zwettl'],
|
87
|
+
['2018 Antwerpen', 'be', 'Antwerpen'],
|
88
|
+
['2870 Breendonk-Puurs', 'be', 'Breendonk-Puurs'],
|
89
|
+
['Alte Plauener Straße 24 // 95028 Hof', 'de', 'Hof'],
|
90
|
+
['95028 Hof // Alte Plauener Straße 24', 'de', 'Hof'],
|
91
|
+
['284 15 Kutná Hora', 'cz', 'Kutná Hora'],
|
92
|
+
['288 25 Nymburk', 'cz', 'Nymburk'],
|
93
|
+
['036 42 Martin', 'sk', 'Martin'],
|
94
|
+
['974 05 Banská Bystrica', 'sk', 'Banská Bystrica'],
|
95
|
+
['Brooklyn | NY 11249', 'us', 'Brooklyn'],
|
96
|
+
['Brooklyn, NY 11249', 'us', 'Brooklyn'],
|
97
|
+
['Brooklyn | NY', 'us', 'Brooklyn'],
|
98
|
+
['Brooklyn, NY', 'us', 'Brooklyn'],
|
99
|
+
]
|
100
|
+
|
101
|
+
txt_io.each_with_index do |txt,i|
|
102
|
+
puts "testing [#{i}] #{txt[0]}"
|
103
|
+
assert_equal txt[2], TextUtils.find_city_in_addr( txt[0], txt[1] )
|
104
|
+
end
|
105
|
+
end # method test_addr
|
106
|
+
|
107
|
+
|
108
|
+
end # class TestAddressHelper
|
data/test/test_title_helper.rb
CHANGED
@@ -34,7 +34,9 @@ class TestTitleHelper < MiniTest::Unit::TestCase
|
|
34
34
|
[ '+Lupulus', 'lupulus' ],
|
35
35
|
[ '+Malta', 'malta' ],
|
36
36
|
[ 'Minerva 8:60', 'minerva860' ],
|
37
|
-
[ 'Hop Crisis!', 'hopcrisis' ]
|
37
|
+
[ 'Hop Crisis!', 'hopcrisis' ],
|
38
|
+
[ '$Alianz$ Arena', 'alianzarena' ],
|
39
|
+
[ 'Arena Amazônia', 'arenaamazonia' ]
|
38
40
|
]
|
39
41
|
|
40
42
|
txt_io.each do |txt|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-02-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &20379624 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0.5'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *20379624
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rdoc
|
27
|
-
requirement: &
|
27
|
+
requirement: &20377728 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '4.0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *20377728
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: hoe
|
38
|
-
requirement: &
|
38
|
+
requirement: &20376972 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '3.7'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *20376972
|
47
47
|
description: textutils - Text Filters, Helpers, Readers and More
|
48
48
|
email: ruby-talk@ruby-lang.org
|
49
49
|
executables: []
|
@@ -81,6 +81,7 @@ files:
|
|
81
81
|
- lib/textutils/utils.rb
|
82
82
|
- lib/textutils/version.rb
|
83
83
|
- test/helper.rb
|
84
|
+
- test/test_address_helper.rb
|
84
85
|
- test/test_hypertext_helper.rb
|
85
86
|
- test/test_title_helper.rb
|
86
87
|
- test/test_unicode_helper.rb
|
@@ -114,6 +115,7 @@ signing_key:
|
|
114
115
|
specification_version: 3
|
115
116
|
summary: textutils - Text Filters, Helpers, Readers and More
|
116
117
|
test_files:
|
118
|
+
- test/test_address_helper.rb
|
117
119
|
- test/test_hypertext_helper.rb
|
118
120
|
- test/test_title_helper.rb
|
119
121
|
- test/test_unicode_helper.rb
|