textutils 0.8.3 → 0.8.4

Sign up to get free protection for your applications and to get access to all the features.
data/Manifest.txt CHANGED
@@ -26,6 +26,7 @@ lib/textutils/sanitizier.rb
26
26
  lib/textutils/utils.rb
27
27
  lib/textutils/version.rb
28
28
  test/helper.rb
29
+ test/test_address_helper.rb
29
30
  test/test_hypertext_helper.rb
30
31
  test/test_title_helper.rb
31
32
  test/test_unicode_helper.rb
@@ -4,21 +4,34 @@
4
4
  module TextUtils
5
5
  module AddressHelper
6
6
 
7
- def normalize_addr( old_address )
8
- # for now only checks german 5-digit zip code
7
+ def normalize_addr( old_address, country_key=nil )
8
+
9
+ # for now only checks german (de) 5-digit zip code and
10
+ # austrian (at) 4-digit zip code
9
11
  #
10
12
  # e.g. Alte Plauener Straße 24 // 95028 Hof becomes
11
13
  # 95028 Hof // Alte Plauener Straße 24
12
14
 
15
+ if country_key.nil?
16
+ puts "TextUtils.normalize_addr drepreciated call - country_key now required; please add !!"
17
+ return old_address
18
+ end
19
+
13
20
  new_address = old_address # default - do nothing - just pass through
14
-
21
+
15
22
  lines = old_address.split( '//' )
16
-
23
+
17
24
  if lines.size == 2 # two lines / check for switching lines
25
+
18
26
  line1 = lines[0].strip
19
27
  line2 = lines[1].strip
20
- if line2 =~ /^[0-9]{5}\s/
21
- new_address = "#{line2} // #{line1}" # swap - let line w/ 5-digit zip code go first
28
+
29
+ regex_nnnn = /^[0-9]{4}\s+/ # four digits postal code
30
+ regex_nnnnn = /^[0-9]{5}\s+/ # five digits postal code
31
+
32
+ if (country_key == 'at' && line2 =~ regex_nnnn ) ||
33
+ (country_key == 'de' && line2 =~ regex_nnnnn )
34
+ new_address = "#{line2} // #{line1}"
22
35
  end
23
36
  end
24
37
 
@@ -26,28 +39,127 @@ module TextUtils
26
39
  end
27
40
 
28
41
 
29
- # todo/fix: add _in_adr or _in_addr to name - why? why not?
30
- # -- make country_key optional - why? why not?
31
- # n move to second pos; use opts={} why? why not?
42
+ def find_city_in_addr_without_postal_code( address )
32
43
 
33
- def find_city_in_addr( address, country_key )
44
+ ## general rule; not country-specific; no postal code/zip code or state
45
+ # - must be like two lines (one line empty) e.g.
46
+ # // London or
47
+ # London //
48
+ # will assume entry is city
49
+ # note: city may NOT include numbers, or pipe (|) or comma (,) chars
50
+
51
+ # fix: use blank?
52
+ return nil if address.nil? || address.empty? # do NOT process nil or empty address lines; sorry
34
53
 
35
- return nil if address.blank? # do NOT process nil or empty address lines; sorry
54
+ old_lines = address.split( '//' )
55
+
56
+ ###
57
+ # note: London // will get split into array with size 1 e.g. ['London ']
58
+ # support it, that is, add missing empty line
59
+
60
+ # 1) strip lines
61
+ # 2) remove blank lines
62
+ lines = []
63
+
64
+ old_lines.each do |line|
65
+ linec = line.strip
66
+ next if linec.empty?
67
+ lines << linec
68
+ end
69
+
70
+ if lines.size == 1
71
+ linec = lines[0]
72
+ # note: city may NOT include
73
+ # numbers (e.g. assumes zip/postal code etc.) or
74
+ # pipe (|) or
75
+ # comma (,)
76
+ if linec =~ /[0-9|,]/
77
+ return nil
78
+ end
79
+ # two or more consecutive uppercase letters e.g. TX NY etc.
80
+ # check if city exists with two uppercase letters??
81
+ if linec =~ /[A-Z]{2,}/
82
+ return nil
83
+ end
84
+ return linec # bingo!!! assume candidate line is a city name
85
+ end
86
+
87
+ nil # no generic city match found
88
+ end
89
+
90
+
91
+ def find_city_in_addr_with_postal_code( address, country_key )
92
+
93
+ # fix: use blank?
94
+ return nil if address.nil? || address.empty? # do NOT process nil or empty address lines; sorry
36
95
 
37
96
  lines = address.split( '//' )
38
97
 
39
- if country_key == 'at' || country_key == 'de'
40
- # first line strip numbers (assuming zip code) and whitespace
41
- line1 = lines[0]
42
- line1 = line1.gsub( /\b[0-9]+\b/, '' ) # use word boundries (why? why not?)
43
- line1 = line1.strip
44
-
45
- return nil if line1.blank? # nothing left sorry; better return nil
46
-
47
- line1 # assume its the city
98
+ if country_key == 'at' || country_key == 'be'
99
+ # support for now
100
+ # - 2018 Antwerpen or 2870 Breendonk-Puurs (be)
101
+ lines.each do |line|
102
+ linec = line.strip
103
+ regex_nnnn = /^[0-9]{4}\s+/
104
+ if linec =~ regex_nnnn # must start w/ four digit postal code ? assume its the city line
105
+ return linec.sub( regex_nnnn, '' ) # cut off leading postal code; assume rest is city
106
+ end
107
+ end
108
+ elsif country_key == 'de'
109
+ lines.each do |line|
110
+ linec = line.strip
111
+ regex_nnnnn = /^[0-9]{5}\s+/
112
+ if linec =~ regex_nnnnn # must start w/ five digit postal code ? assume its the city line
113
+ return linec.sub( regex_nnnnn, '' ) # cut off leading postal code; assume rest is city
114
+ end
115
+ end
116
+ elsif country_key == 'cz' || country_key == 'sk'
117
+ # support for now
118
+ # - 284 15 Kutná Hora or 288 25 Nymburk (cz)
119
+ # - 036 42 Martin or 974 05 Banská Bystrica (sk)
120
+ lines.each do |line|
121
+ linec = line.strip
122
+ regex_nnn_nn = /^[0-9]{3}\s[0-9]{2}\s+/
123
+ if linec =~ regex_nnn_nn # must start w/ postal code in nnn nn format (three digits, space, two digits) ? assume its the city line
124
+ return linec.sub( regex_nnn_nn, '' ) # cut off leading postal code; assume rest is city
125
+ end
126
+ end
127
+ elsif country_key == 'us'
128
+ # support for now
129
+ # - Brooklyn | NY 11249 or Brooklyn, NY 11249
130
+ # - Brooklyn | NY or Brooklyn, NY
131
+
132
+ lines.each do |line|
133
+ linec = line.strip
134
+ regexes_us = [/\s*[|,]\s+[A-Z]{2}\s+[0-9]{5}\s*$/,
135
+ /\s*[|,]\s+[A-Z]{2}\s*$/]
136
+
137
+ regexes_us.each do |regex|
138
+ if linec =~ regex
139
+ return linec.sub( regex, '' ) # cut off trailing state (and zip code); assume rest is city
140
+ end
141
+ end
142
+ end
48
143
  else
49
- nil # unsupported country/address schema for now; sorry
144
+ # unsupported country/address schema for now; sorry
50
145
  end
146
+ return nil # sorry nothing found
147
+ end
148
+
149
+
150
+ def find_city_in_addr( address, country_key )
151
+
152
+ # fix: use blank?
153
+ return nil if address.nil? || address.empty? # do NOT process nil or empty address lines; sorry
154
+
155
+ ## try generic rule first (e.g. w/o postal code/zip code or state), see above
156
+ city = find_city_in_addr_without_postal_code( address )
157
+ return city unless city.nil?
158
+
159
+ city = find_city_in_addr_with_postal_code( address, country_key )
160
+ return city unless city.nil?
161
+
162
+ nil # sorry; no city found (using known patterns)
51
163
  end
52
164
 
53
165
 
@@ -49,10 +49,11 @@ module TextUtils
49
49
  end
50
50
 
51
51
  def strip_special_chars( title )
52
- # remove special chars (e.g. %°&)
52
+ # remove special chars (e.g. %°&$)
53
53
  # e.g. +Malta
54
54
  # Minerva 8:60
55
- title.gsub( /[%&°+:]/, '' )
55
+ # $Alianz$ Arena
56
+ title.gsub( /[%&°+:$]/, '' )
56
57
  end
57
58
 
58
59
  def title_to_key( title )
@@ -112,6 +113,7 @@ module TextUtils
112
113
  ['ő', 'o' ], # e.g. Győri
113
114
  ['ó', 'o' ], # e.g. Colón, Łódź, Kraków
114
115
  ['õ', 'o' ], # e.g. Nõmme
116
+ ['ô', 'o' ], # e.g. Amazônia (pt)
115
117
  ['ø', 'o' ], # e.g. Fuglafjørdur, København
116
118
  ['ř', 'r' ], # e.g. Třeboň
117
119
  ['ș', 's' ], # e.g. Chișinău, București
@@ -1,7 +1,7 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.8.3'
4
+ VERSION = '0.8.4'
5
5
 
6
6
  end # module TextUtils
7
7
 
@@ -0,0 +1,108 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'helper'
5
+
6
+ class TestAddressHelper < MiniTest::Unit::TestCase
7
+
8
+ def test_normalize_addr
9
+
10
+ txt_io = [
11
+ ['Alte Plauener Straße 24 // 95028 Hof', nil, 'Alte Plauener Straße 24 // 95028 Hof'],
12
+ ['Alte Plauener Straße 24 // 95028 Hof', 'de', '95028 Hof // Alte Plauener Straße 24'],
13
+ ['Mautner Markhof-Straße 11 // 2320 Schwechat', nil, 'Mautner Markhof-Straße 11 // 2320 Schwechat'],
14
+ ['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', '2320 Schwechat // Mautner Markhof-Straße 11']
15
+ ]
16
+
17
+ txt_io.each_with_index do |txt,i|
18
+ puts "testing [#{i}] #{txt[0]}"
19
+ assert_equal txt[2], TextUtils.normalize_addr( txt[0], txt[1] )
20
+ end
21
+
22
+ end # method test_normalize_addr
23
+
24
+
25
+ def test_addr_without_postal_code # aka generic rule
26
+
27
+ txt_io = [
28
+ ['London //', 'London'],
29
+ ['// London', 'London'],
30
+ ['// London ', 'London'],
31
+ [' // London', 'London'],
32
+ ['// London, W4 2QB', nil],
33
+ ['// London | W4 2QB', nil],
34
+ ['// London W4 2QB', nil],
35
+ ['Chiswick Lane South // London, W4 2QB', nil],
36
+ ['The Griffin Brewery // Chiswick Lane South // London', nil], # three lines will NOT work, sorry
37
+ ['// New York, NY', nil],
38
+ ['// New York NY', nil] # check: does it exist in the real world (e.g. w/o comma or pipe?) support it?
39
+ ]
40
+
41
+ txt_io.each_with_index do |txt,i|
42
+ puts "testing [#{i}] #{txt[0]}"
43
+ assert_equal txt[1], TextUtils.find_city_in_addr_without_postal_code( txt[0] )
44
+ end
45
+ end # method test_addr_without_postal_code
46
+
47
+
48
+ def test_addr_with_postal_code
49
+
50
+ txt_io = [
51
+ ['2320 Schwechat // Mautner Markhof-Straße 11', 'at', 'Schwechat'],
52
+ ['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', 'Schwechat'],
53
+ ['3910 Zwettl // Syrnauer Straße 22-25', 'at', 'Zwettl'],
54
+ ['Syrnauer Straße 22-25 // 3910 Zwettl', 'at', 'Zwettl'],
55
+ ['2018 Antwerpen', 'be', 'Antwerpen'],
56
+ ['2870 Breendonk-Puurs', 'be', 'Breendonk-Puurs'],
57
+ ['Alte Plauener Straße 24 // 95028 Hof', 'de', 'Hof'],
58
+ ['95028 Hof // Alte Plauener Straße 24', 'de', 'Hof'],
59
+ ['284 15 Kutná Hora', 'cz', 'Kutná Hora'],
60
+ ['288 25 Nymburk', 'cz', 'Nymburk'],
61
+ ['036 42 Martin', 'sk', 'Martin'],
62
+ ['974 05 Banská Bystrica', 'sk', 'Banská Bystrica'],
63
+ ['Brooklyn | NY 11249', 'us', 'Brooklyn'],
64
+ ['Brooklyn, NY 11249', 'us', 'Brooklyn'],
65
+ ['Brooklyn | NY', 'us', 'Brooklyn'],
66
+ ['Brooklyn, NY', 'us', 'Brooklyn'],
67
+ ]
68
+
69
+ txt_io.each_with_index do |txt,i|
70
+ puts "testing [#{i}] #{txt[0]}"
71
+ assert_equal txt[2], TextUtils.find_city_in_addr_with_postal_code( txt[0], txt[1] )
72
+ end
73
+ end # method test_addr_with_postal_code
74
+
75
+
76
+ def test_addr
77
+
78
+ txt_io = [
79
+ ['London //', nil, 'London'],
80
+ ['// London', nil, 'London'],
81
+ ['// London ', nil, 'London'],
82
+ [' // London', nil, 'London'],
83
+ ['2320 Schwechat // Mautner Markhof-Straße 11', 'at', 'Schwechat'],
84
+ ['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', 'Schwechat'],
85
+ ['3910 Zwettl // Syrnauer Straße 22-25', 'at', 'Zwettl'],
86
+ ['Syrnauer Straße 22-25 // 3910 Zwettl', 'at', 'Zwettl'],
87
+ ['2018 Antwerpen', 'be', 'Antwerpen'],
88
+ ['2870 Breendonk-Puurs', 'be', 'Breendonk-Puurs'],
89
+ ['Alte Plauener Straße 24 // 95028 Hof', 'de', 'Hof'],
90
+ ['95028 Hof // Alte Plauener Straße 24', 'de', 'Hof'],
91
+ ['284 15 Kutná Hora', 'cz', 'Kutná Hora'],
92
+ ['288 25 Nymburk', 'cz', 'Nymburk'],
93
+ ['036 42 Martin', 'sk', 'Martin'],
94
+ ['974 05 Banská Bystrica', 'sk', 'Banská Bystrica'],
95
+ ['Brooklyn | NY 11249', 'us', 'Brooklyn'],
96
+ ['Brooklyn, NY 11249', 'us', 'Brooklyn'],
97
+ ['Brooklyn | NY', 'us', 'Brooklyn'],
98
+ ['Brooklyn, NY', 'us', 'Brooklyn'],
99
+ ]
100
+
101
+ txt_io.each_with_index do |txt,i|
102
+ puts "testing [#{i}] #{txt[0]}"
103
+ assert_equal txt[2], TextUtils.find_city_in_addr( txt[0], txt[1] )
104
+ end
105
+ end # method test_addr
106
+
107
+
108
+ end # class TestAddressHelper
@@ -34,7 +34,9 @@ class TestTitleHelper < MiniTest::Unit::TestCase
34
34
  [ '+Lupulus', 'lupulus' ],
35
35
  [ '+Malta', 'malta' ],
36
36
  [ 'Minerva 8:60', 'minerva860' ],
37
- [ 'Hop Crisis!', 'hopcrisis' ]
37
+ [ 'Hop Crisis!', 'hopcrisis' ],
38
+ [ '$Alianz$ Arena', 'alianzarena' ],
39
+ [ 'Arena Amazônia', 'arenaamazonia' ]
38
40
  ]
39
41
 
40
42
  txt_io.each do |txt|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.3
4
+ version: 0.8.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-01-23 00:00:00.000000000 Z
12
+ date: 2014-02-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: logutils
16
- requirement: &21298812 !ruby/object:Gem::Requirement
16
+ requirement: &20379624 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0.5'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *21298812
24
+ version_requirements: *20379624
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rdoc
27
- requirement: &21298416 !ruby/object:Gem::Requirement
27
+ requirement: &20377728 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '4.0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *21298416
35
+ version_requirements: *20377728
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: hoe
38
- requirement: &21297996 !ruby/object:Gem::Requirement
38
+ requirement: &20376972 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '3.7'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *21297996
46
+ version_requirements: *20376972
47
47
  description: textutils - Text Filters, Helpers, Readers and More
48
48
  email: ruby-talk@ruby-lang.org
49
49
  executables: []
@@ -81,6 +81,7 @@ files:
81
81
  - lib/textutils/utils.rb
82
82
  - lib/textutils/version.rb
83
83
  - test/helper.rb
84
+ - test/test_address_helper.rb
84
85
  - test/test_hypertext_helper.rb
85
86
  - test/test_title_helper.rb
86
87
  - test/test_unicode_helper.rb
@@ -114,6 +115,7 @@ signing_key:
114
115
  specification_version: 3
115
116
  summary: textutils - Text Filters, Helpers, Readers and More
116
117
  test_files:
118
+ - test/test_address_helper.rb
117
119
  - test/test_hypertext_helper.rb
118
120
  - test/test_title_helper.rb
119
121
  - test/test_unicode_helper.rb