textutils 0.8.3 → 0.8.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Manifest.txt +1 -0
- data/lib/textutils/helper/address_helper.rb +133 -21
- data/lib/textutils/helper/title_helper.rb +4 -2
- data/lib/textutils/version.rb +1 -1
- data/test/test_address_helper.rb +108 -0
- data/test/test_title_helper.rb +3 -1
- metadata +10 -8
data/Manifest.txt
CHANGED
@@ -4,21 +4,34 @@
|
|
4
4
|
module TextUtils
|
5
5
|
module AddressHelper
|
6
6
|
|
7
|
-
def normalize_addr( old_address )
|
8
|
-
|
7
|
+
def normalize_addr( old_address, country_key=nil )
|
8
|
+
|
9
|
+
# for now only checks german (de) 5-digit zip code and
|
10
|
+
# austrian (at) 4-digit zip code
|
9
11
|
#
|
10
12
|
# e.g. Alte Plauener Straße 24 // 95028 Hof becomes
|
11
13
|
# 95028 Hof // Alte Plauener Straße 24
|
12
14
|
|
15
|
+
if country_key.nil?
|
16
|
+
puts "TextUtils.normalize_addr drepreciated call - country_key now required; please add !!"
|
17
|
+
return old_address
|
18
|
+
end
|
19
|
+
|
13
20
|
new_address = old_address # default - do nothing - just path through
|
14
|
-
|
21
|
+
|
15
22
|
lines = old_address.split( '//' )
|
16
|
-
|
23
|
+
|
17
24
|
if lines.size == 2 # two lines / check for switching lines
|
25
|
+
|
18
26
|
line1 = lines[0].strip
|
19
27
|
line2 = lines[1].strip
|
20
|
-
|
21
|
-
|
28
|
+
|
29
|
+
regex_nnnn = /^[0-9]{4}\s+/ # four digits postal code
|
30
|
+
regex_nnnnn = /^[0-9]{5}\s+/ # five digits postal code
|
31
|
+
|
32
|
+
if (country_key == 'at' && line2 =~ regex_nnnn ) ||
|
33
|
+
(country_key == 'de' && line2 =~ regex_nnnnn )
|
34
|
+
new_address = "#{line2} // #{line1}"
|
22
35
|
end
|
23
36
|
end
|
24
37
|
|
@@ -26,28 +39,127 @@ module TextUtils
|
|
26
39
|
end
|
27
40
|
|
28
41
|
|
29
|
-
|
30
|
-
# -- make country_key optional - why? why not?
|
31
|
-
# n move to second pos; use opts={} why? why not?
|
42
|
+
def find_city_in_addr_without_postal_code( address )
  ## Generic rule; not country-specific; for addresses that carry
  #  no postal code/zip code or state - only a city name.
  #
  #  After splitting on '//' exactly one non-blank line must remain, e.g.
  #     // London    or
  #     London //
  #  (note: 'London //' gets split into an array of size 1, e.g. ['London ']).
  #  That single line is then assumed to be the city.
  #
  #  address - address string using '//' as the line separator (may be nil)
  #
  #  Returns the stripped city name (String) or nil if no city can be assumed.

  # do NOT process nil or empty address lines; sorry
  return nil if address.nil? || address.empty?

  # 1) strip lines, 2) remove blank lines
  lines = address.split( '//' ).map( &:strip ).reject( &:empty? )

  # none or more than one candidate line left - no generic city match
  return nil unless lines.size == 1

  candidate = lines.first

  # a city may NOT include numbers (assumed to be a zip/postal code etc.),
  #  a pipe (|) or a comma (,)
  return nil if candidate =~ /[0-9|,]/

  # two or more uppercase letters in a row look like a state code e.g. TX, NY etc.
  return nil if candidate =~ /[A-Z]{2,}/

  candidate   # bingo!!! assume candidate line is a city name
end
|
89
|
+
|
90
|
+
|
91
|
+
def find_city_in_addr_with_postal_code( address, country_key )
  ## Country-specific rule: find the address line that carries the
  #  postal code (and/or state), cut that part off and assume the rest is the city.
  #
  #  address     - address string using '//' as the line separator (may be nil)
  #  country_key - two-letter country key; supported for now:
  #                  at, be - four digit postal code first
  #                             e.g. 2018 Antwerpen or 2870 Breendonk-Puurs (be)
  #                  de     - five digit postal code first
  #                             e.g. 95028 Hof
  #                  cz, sk - postal code written as nnn nn first
  #                             e.g. 284 15 Kutná Hora (cz) or 036 42 Martin (sk)
  #                  us     - city first followed by state and optional zip code
  #                             e.g. Brooklyn | NY 11249 or Brooklyn, NY
  #
  #  Returns the city (String) taken from the first matching line or
  #  nil if nothing matches or the country is not supported.

  # do NOT process nil or empty address lines; sorry
  return nil if address.nil? || address.empty?

  # patterns for the NON-city part of a city line;
  #  anchored with \A and \z (not ^ and $) so that a match can only happen
  #  at the very start/end of the (stripped) line and never after an embedded newline
  patterns =
    case country_key
    when 'at', 'be' then [/\A[0-9]{4}\s+/]
    when 'de'       then [/\A[0-9]{5}\s+/]
    when 'cz', 'sk' then [/\A[0-9]{3}\s[0-9]{2}\s+/]
    when 'us'       then [/\s*[|,]\s+[A-Z]{2}\s+[0-9]{5}\s*\z/,   # state plus zip code
                          /\s*[|,]\s+[A-Z]{2}\s*\z/]              # state only
    else                 []   # unsupported country/address schema for now; sorry
    end

  address.split( '//' ).each do |line|
    linec = line.strip
    patterns.each do |pattern|
      # first line matching wins; cut off postal code/state - assume rest is city
      return linec.sub( pattern, '' ) if linec =~ pattern
    end
  end

  nil   # sorry nothing found
end
|
148
|
+
|
149
|
+
|
150
|
+
def find_city_in_addr( address, country_key )
  ## Find the city in an address using the known patterns.
  #
  #  address     - address string (may be nil)
  #  country_key - country key passed on to the postal code rule
  #
  #  Returns the city (String) or nil if no city is found.

  # do NOT process nil or empty address lines; sorry
  return nil if address.nil? || address.empty?

  # try the generic rule first (e.g. w/o postal code/zip code or state);
  #  fall back to the country-specific postal code rule;
  #  the result is nil if neither rule finds a city
  find_city_in_addr_without_postal_code( address ) ||
    find_city_in_addr_with_postal_code( address, country_key )
end
|
52
164
|
|
53
165
|
|
@@ -49,10 +49,11 @@ module TextUtils
|
|
49
49
|
end
|
50
50
|
|
51
51
|
def strip_special_chars( title )
  # remove special chars (e.g. %°&$) from title; removes
  #  percent (%), ampersand (&), degree (°), plus (+), colon (:) and dollar ($)
  #   e.g. +Malta          becomes  Malta
  #        Minerva 8:60    becomes  Minerva 860
  #        $Alianz$ Arena  becomes  Alianz Arena
  #
  # returns a new string; title itself is NOT changed
  #
  # note: String#delete removes a fixed set of characters without a regex;
  #   none of the listed chars is special for delete (that is, no ^, - or \)
  title.delete( '%&°+:$' )
end
|
57
58
|
|
58
59
|
def title_to_key( title )
|
@@ -112,6 +113,7 @@ module TextUtils
|
|
112
113
|
['ő', 'o' ], # e.g. Győri
|
113
114
|
['ó', 'o' ], # e.g. Colón, Łódź, Kraków
|
114
115
|
['õ', 'o' ], # e.g. Nõmme
|
116
|
+
['ô', 'o' ], # e.g. Amazônia (pt)
|
115
117
|
['ø', 'o' ], # e.g. Fuglafjørdur, København
|
116
118
|
['ř', 'r' ], # e.g. Třeboň
|
117
119
|
['ș', 's' ], # e.g. Chișinău, București
|
data/lib/textutils/version.rb
CHANGED
@@ -0,0 +1,108 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
require 'helper'
|
5
|
+
|
6
|
+
class TestAddressHelper < MiniTest::Unit::TestCase

  # normalize_addr: without a country key the address passes through unchanged;
  #  with 'de' or 'at' the postal code line gets moved to the front
  def test_normalize_addr
    samples = [
      ['Alte Plauener Straße 24 // 95028 Hof',        nil,  'Alte Plauener Straße 24 // 95028 Hof'],
      ['Alte Plauener Straße 24 // 95028 Hof',        'de', '95028 Hof // Alte Plauener Straße 24'],
      ['Mautner Markhof-Straße 11 // 2320 Schwechat', nil,  'Mautner Markhof-Straße 11 // 2320 Schwechat'],
      ['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', '2320 Schwechat // Mautner Markhof-Straße 11']
    ]

    samples.each_with_index do |(addr, country_key, expected), idx|
      puts "testing [#{idx}] #{addr}"
      assert_equal expected, TextUtils.normalize_addr( addr, country_key )
    end
  end # method test_normalize_addr


  # generic rule: a single city line without postal code/zip code or state
  def test_addr_without_postal_code
    samples = [
      ['London //',                                            'London'],
      ['// London',                                            'London'],
      ['// London ',                                           'London'],
      [' // London',                                           'London'],
      ['// London, W4 2QB',                                    nil],
      ['// London | W4 2QB',                                   nil],
      ['// London W4 2QB',                                     nil],
      ['Chiswick Lane South // London, W4 2QB',                nil],
      ['The Griffin Brewery // Chiswick Lane South // London', nil],  # three lines will NOT work, sorry
      ['// New York, NY',                                      nil],
      ['// New York NY',                                       nil]   # check: does it exist in the real world (e.g. w/o comma or pipe?) support it?
    ]

    samples.each_with_index do |(addr, expected), idx|
      puts "testing [#{idx}] #{addr}"
      assert_equal expected, TextUtils.find_city_in_addr_without_postal_code( addr )
    end
  end # method test_addr_without_postal_code


  # country-specific rules: city line carries a postal code and/or state
  def test_addr_with_postal_code
    postal_code_samples.each_with_index do |(addr, country_key, expected), idx|
      puts "testing [#{idx}] #{addr}"
      assert_equal expected, TextUtils.find_city_in_addr_with_postal_code( addr, country_key )
    end
  end # method test_addr_with_postal_code


  # combined lookup: generic samples first, followed by the postal code samples
  def test_addr
    samples = [
      ['London //',  nil, 'London'],
      ['// London',  nil, 'London'],
      ['// London ', nil, 'London'],
      [' // London', nil, 'London']
    ] + postal_code_samples

    samples.each_with_index do |(addr, country_key, expected), idx|
      puts "testing [#{idx}] #{addr}"
      assert_equal expected, TextUtils.find_city_in_addr( addr, country_key )
    end
  end # method test_addr


  private

  # samples shared by test_addr_with_postal_code and test_addr;
  #  every entry is [address, country_key, expected city]
  def postal_code_samples
    [
      ['2320 Schwechat // Mautner Markhof-Straße 11', 'at', 'Schwechat'],
      ['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', 'Schwechat'],
      ['3910 Zwettl // Syrnauer Straße 22-25',        'at', 'Zwettl'],
      ['Syrnauer Straße 22-25 // 3910 Zwettl',        'at', 'Zwettl'],
      ['2018 Antwerpen',                              'be', 'Antwerpen'],
      ['2870 Breendonk-Puurs',                        'be', 'Breendonk-Puurs'],
      ['Alte Plauener Straße 24 // 95028 Hof',        'de', 'Hof'],
      ['95028 Hof // Alte Plauener Straße 24',        'de', 'Hof'],
      ['284 15 Kutná Hora',                           'cz', 'Kutná Hora'],
      ['288 25 Nymburk',                              'cz', 'Nymburk'],
      ['036 42 Martin',                               'sk', 'Martin'],
      ['974 05 Banská Bystrica',                      'sk', 'Banská Bystrica'],
      ['Brooklyn | NY 11249',                         'us', 'Brooklyn'],
      ['Brooklyn, NY 11249',                          'us', 'Brooklyn'],
      ['Brooklyn | NY',                               'us', 'Brooklyn'],
      ['Brooklyn, NY',                                'us', 'Brooklyn']
    ]
  end

end # class TestAddressHelper
|
data/test/test_title_helper.rb
CHANGED
@@ -34,7 +34,9 @@ class TestTitleHelper < MiniTest::Unit::TestCase
|
|
34
34
|
[ '+Lupulus', 'lupulus' ],
|
35
35
|
[ '+Malta', 'malta' ],
|
36
36
|
[ 'Minerva 8:60', 'minerva860' ],
|
37
|
-
[ 'Hop Crisis!', 'hopcrisis' ]
|
37
|
+
[ 'Hop Crisis!', 'hopcrisis' ],
|
38
|
+
[ '$Alianz$ Arena', 'alianzarena' ],
|
39
|
+
[ 'Arena Amazônia', 'arenaamazonia' ]
|
38
40
|
]
|
39
41
|
|
40
42
|
txt_io.each do |txt|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.8.
|
4
|
+
version: 0.8.4
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2014-
|
12
|
+
date: 2014-02-12 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &20379624 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0.5'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *20379624
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rdoc
|
27
|
-
requirement: &
|
27
|
+
requirement: &20377728 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '4.0'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *20377728
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: hoe
|
38
|
-
requirement: &
|
38
|
+
requirement: &20376972 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '3.7'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *20376972
|
47
47
|
description: textutils - Text Filters, Helpers, Readers and More
|
48
48
|
email: ruby-talk@ruby-lang.org
|
49
49
|
executables: []
|
@@ -81,6 +81,7 @@ files:
|
|
81
81
|
- lib/textutils/utils.rb
|
82
82
|
- lib/textutils/version.rb
|
83
83
|
- test/helper.rb
|
84
|
+
- test/test_address_helper.rb
|
84
85
|
- test/test_hypertext_helper.rb
|
85
86
|
- test/test_title_helper.rb
|
86
87
|
- test/test_unicode_helper.rb
|
@@ -114,6 +115,7 @@ signing_key:
|
|
114
115
|
specification_version: 3
|
115
116
|
summary: textutils - Text Filters, Helpers, Readers and More
|
116
117
|
test_files:
|
118
|
+
- test/test_address_helper.rb
|
117
119
|
- test/test_hypertext_helper.rb
|
118
120
|
- test/test_title_helper.rb
|
119
121
|
- test/test_unicode_helper.rb
|