textutils 0.8.3 → 0.8.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Manifest.txt CHANGED
@@ -26,6 +26,7 @@ lib/textutils/sanitizier.rb
26
26
  lib/textutils/utils.rb
27
27
  lib/textutils/version.rb
28
28
  test/helper.rb
29
+ test/test_address_helper.rb
29
30
  test/test_hypertext_helper.rb
30
31
  test/test_title_helper.rb
31
32
  test/test_unicode_helper.rb
@@ -4,21 +4,34 @@
4
4
  module TextUtils
5
5
  module AddressHelper
6
6
 
7
- def normalize_addr( old_address )
8
- # for now only checks german 5-digit zip code
7
+ def normalize_addr( old_address, country_key=nil )
8
+
9
+ # for now only checks german (de) 5-digit zip code and
10
+ # austrian (at) 4-digit zip code
9
11
  #
10
12
  # e.g. Alte Plauener Straße 24 // 95028 Hof becomes
11
13
  # 95028 Hof // Alte Plauener Straße 24
12
14
 
15
+ if country_key.nil?
16
+ puts "TextUtils.normalize_addr drepreciated call - country_key now required; please add !!"
17
+ return old_address
18
+ end
19
+
13
20
  new_address = old_address # default - do nothing - just path through
14
-
21
+
15
22
  lines = old_address.split( '//' )
16
-
23
+
17
24
  if lines.size == 2 # two lines / check for switching lines
25
+
18
26
  line1 = lines[0].strip
19
27
  line2 = lines[1].strip
20
- if line2 =~ /^[0-9]{5}\s/
21
- new_address = "#{line2} // #{line1}" # swap - let line w/ 5-digit zip code go first
28
+
29
+ regex_nnnn = /^[0-9]{4}\s+/ # four digits postal code
30
+ regex_nnnnn = /^[0-9]{5}\s+/ # five digits postal code
31
+
32
+ if (country_key == 'at' && line2 =~ regex_nnnn ) ||
33
+ (country_key == 'de' && line2 =~ regex_nnnnn )
34
+ new_address = "#{line2} // #{line1}"
22
35
  end
23
36
  end
24
37
 
@@ -26,28 +39,127 @@ module TextUtils
26
39
  end
27
40
 
28
41
 
29
- # todo/fix: add _in_adr or _in_addr to name - why? why not?
30
- # -- make country_key optional - why? why not?
31
- # n move to second pos; use opts={} why? why not?
42
+ def find_city_in_addr_without_postal_code( address )
32
43
 
33
- def find_city_in_addr( address, country_key )
44
+ ## general rule; not country-specific; no postal code/zip code or state
45
+ # - must be like two lines (one line empty) e.g.
46
+ # // London or
47
+ # London //
48
+ # will assume entry is city
49
+ # note: city may NOT include numbers, or pipe (|) or comma (,) chars
50
+
51
+ # fix: use blank?
52
+ return nil if address.nil? || address.empty? # do NOT process nil or empty address lines; sorry
34
53
 
35
- return nil if address.blank? # do NOT process nil or empty address lines; sorry
54
+ old_lines = address.split( '//' )
55
+
56
+ ###
57
+ # note: London // will get split into array with size 1 e.g. ['London ']
58
+ # support it, that is, add missing empty line
59
+
60
+ # 1) strip lines
61
+ # 2) remove blank lines
62
+ lines = []
63
+
64
+ old_lines.each do |line|
65
+ linec = line.strip
66
+ next if linec.empty?
67
+ lines << linec
68
+ end
69
+
70
+ if lines.size == 1
71
+ linec = lines[0]
72
+ # note: city may NOT include
73
+ # numbers (e.g. assumes zip/postal code etc.) or
74
+ # pipe (|) or
75
+ # comma (,)
76
+ if linec =~ /[0-9|,]/
77
+ return nil
78
+ end
79
+ # two or more consecutive uppercase letters e.g. TX NY etc.
80
+ # check if city exists with two uppercase letters??
81
+ if linec =~ /[A-Z]{2,}/
82
+ return nil
83
+ end
84
+ return linec # bingo!!! assume candidate line is a city name
85
+ end
86
+
87
+ nil # no generic city match found
88
+ end
89
+
90
+
91
+ def find_city_in_addr_with_postal_code( address, country_key )
92
+
93
+ # fix: use blank?
94
+ return nil if address.nil? || address.empty? # do NOT process nil or empty address lines; sorry
36
95
 
37
96
  lines = address.split( '//' )
38
97
 
39
- if country_key == 'at' || country_key == 'de'
40
- # first line strip numbers (assuming zip code) and whitespace
41
- line1 = lines[0]
42
- line1 = line1.gsub( /\b[0-9]+\b/, '' ) # use word boundries (why? why not?)
43
- line1 = line1.strip
44
-
45
- return nil if line1.blank? # nothing left sorry; better return nil
46
-
47
- line1 # assume its the city
98
+ if country_key == 'at' || country_key == 'be'
99
+ # support for now
100
+ # - 2018 Antwerpen or 2870 Breendonk-Puurs (be)
101
+ lines.each do |line|
102
+ linec = line.strip
103
+ regex_nnnn = /^[0-9]{4}\s+/
104
+ if linec =~ regex_nnnn # must start w/ four digit postal code ? assume its the city line
105
+ return linec.sub( regex_nnnn, '' ) # cut off leading postal code; assume rest is city
106
+ end
107
+ end
108
+ elsif country_key == 'de'
109
+ lines.each do |line|
110
+ linec = line.strip
111
+ regex_nnnnn = /^[0-9]{5}\s+/
112
+ if linec =~ regex_nnnnn # must start w/ five digit postal code ? assume its the city line
113
+ return linec.sub( regex_nnnnn, '' ) # cut off leading postal code; assume rest is city
114
+ end
115
+ end
116
+ elsif country_key == 'cz' || country_key == 'sk'
117
+ # support for now
118
+ # - 284 15 Kutná Hora or 288 25 Nymburk (cz)
119
+ # - 036 42 Martin or 974 05 Banská Bystrica (sk)
120
+ lines.each do |line|
121
+ linec = line.strip
122
+ regex_nnn_nn = /^[0-9]{3}\s[0-9]{2}\s+/
123
+ if linec =~ regex_nnn_nn # must start w/ five digit postal code ? assume its the city line
124
+ return linec.sub( regex_nnn_nn, '' ) # cut off leading postal code; assume rest is city
125
+ end
126
+ end
127
+ elsif country_key == 'us'
128
+ # support for now
129
+ # - Brooklyn | NY 11249 or Brooklyn, NY 11249
130
+ # - Brooklyn | NY or Brooklyn, NY
131
+
132
+ lines.each do |line|
133
+ linec = line.strip
134
+ regexes_us = [/\s*[|,]\s+[A-Z]{2}\s+[0-9]{5}\s*$/,
135
+ /\s*[|,]\s+[A-Z]{2}\s*$/]
136
+
137
+ regexes_us.each do |regex|
138
+ if linec =~ regex
139
+ return linec.sub( regex, '' ) # cut off trailing state (and zip code); assume rest is city
140
+ end
141
+ end
142
+ end
48
143
  else
49
- nil # unsupported country/address schema for now; sorry
144
+ # unsupported country/address schema for now; sorry
50
145
  end
146
+ return nil # sorry nothing found
147
+ end
148
+
149
+
150
+ def find_city_in_addr( address, country_key )
151
+
152
+ # fix: use blank?
153
+ return nil if address.nil? || address.empty? # do NOT process nil or empty address lines; sorry
154
+
155
+ ## try generic rule first (e.g. w/o postal code/zip code or state), see above
156
+ city = find_city_in_addr_without_postal_code( address )
157
+ return city unless city.nil?
158
+
159
+ city = find_city_in_addr_with_postal_code( address, country_key )
160
+ return city unless city.nil?
161
+
162
+ nil # sorry; no city found (using known patterns)
51
163
  end
52
164
 
53
165
 
@@ -49,10 +49,11 @@ module TextUtils
49
49
  end
50
50
 
51
51
  def strip_special_chars( title )
52
- # remove special chars (e.g. %°&)
52
+ # remove special chars (e.g. %°&$)
53
53
  # e.g. +Malta
54
54
  # Minerva 8:60
55
- title.gsub( /[%&°+:]/, '' )
55
+ # $Alianz$ Arena
56
+ title.gsub( /[%&°+:$]/, '' )
56
57
  end
57
58
 
58
59
  def title_to_key( title )
@@ -112,6 +113,7 @@ module TextUtils
112
113
  ['ő', 'o' ], # e.g. Győri
113
114
  ['ó', 'o' ], # e.g. Colón, Łódź, Kraków
114
115
  ['õ', 'o' ], # e.g. Nõmme
116
+ ['ô', 'o' ], # e.g. Amazônia (pt)
115
117
  ['ø', 'o' ], # e.g. Fuglafjørdur, København
116
118
  ['ř', 'r' ], # e.g. Třeboň
117
119
  ['ș', 's' ], # e.g. Chișinău, București
@@ -1,7 +1,7 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.8.3'
4
+ VERSION = '0.8.4'
5
5
 
6
6
  end # module TextUtils
7
7
 
@@ -0,0 +1,108 @@
1
+ # encoding: utf-8
2
+
3
+
4
+ require 'helper'
5
+
6
+ class TestAddressHelper < MiniTest::Unit::TestCase
7
+
8
+ def test_normalize_addr
9
+
10
+ txt_io = [
11
+ ['Alte Plauener Straße 24 // 95028 Hof', nil, 'Alte Plauener Straße 24 // 95028 Hof'],
12
+ ['Alte Plauener Straße 24 // 95028 Hof', 'de', '95028 Hof // Alte Plauener Straße 24'],
13
+ ['Mautner Markhof-Straße 11 // 2320 Schwechat', nil, 'Mautner Markhof-Straße 11 // 2320 Schwechat'],
14
+ ['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', '2320 Schwechat // Mautner Markhof-Straße 11']
15
+ ]
16
+
17
+ txt_io.each_with_index do |txt,i|
18
+ puts "testing [#{i}] #{txt[0]}"
19
+ assert_equal txt[2], TextUtils.normalize_addr( txt[0], txt[1] )
20
+ end
21
+
22
+ end # method test_normalize_addr
23
+
24
+
25
+ def test_addr_without_postal_code # aka generic rule
26
+
27
+ txt_io = [
28
+ ['London //', 'London'],
29
+ ['// London', 'London'],
30
+ ['// London ', 'London'],
31
+ [' // London', 'London'],
32
+ ['// London, W4 2QB', nil],
33
+ ['// London | W4 2QB', nil],
34
+ ['// London W4 2QB', nil],
35
+ ['Chiswick Lane South // London, W4 2QB', nil],
36
+ ['The Griffin Brewery // Chiswick Lane South // London', nil], # three lines will NOT work, sorry
37
+ ['// New York, NY', nil],
38
+ ['// New York NY', nil] # check: does it exist in the real world (e.g. w/o comma or pipe?) support it?
39
+ ]
40
+
41
+ txt_io.each_with_index do |txt,i|
42
+ puts "testing [#{i}] #{txt[0]}"
43
+ assert_equal txt[1], TextUtils.find_city_in_addr_without_postal_code( txt[0] )
44
+ end
45
+ end # method test_addr_without_postal_code
46
+
47
+
48
+ def test_addr_with_postal_code
49
+
50
+ txt_io = [
51
+ ['2320 Schwechat // Mautner Markhof-Straße 11', 'at', 'Schwechat'],
52
+ ['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', 'Schwechat'],
53
+ ['3910 Zwettl // Syrnauer Straße 22-25', 'at', 'Zwettl'],
54
+ ['Syrnauer Straße 22-25 // 3910 Zwettl', 'at', 'Zwettl'],
55
+ ['2018 Antwerpen', 'be', 'Antwerpen'],
56
+ ['2870 Breendonk-Puurs', 'be', 'Breendonk-Puurs'],
57
+ ['Alte Plauener Straße 24 // 95028 Hof', 'de', 'Hof'],
58
+ ['95028 Hof // Alte Plauener Straße 24', 'de', 'Hof'],
59
+ ['284 15 Kutná Hora', 'cz', 'Kutná Hora'],
60
+ ['288 25 Nymburk', 'cz', 'Nymburk'],
61
+ ['036 42 Martin', 'sk', 'Martin'],
62
+ ['974 05 Banská Bystrica', 'sk', 'Banská Bystrica'],
63
+ ['Brooklyn | NY 11249', 'us', 'Brooklyn'],
64
+ ['Brooklyn, NY 11249', 'us', 'Brooklyn'],
65
+ ['Brooklyn | NY', 'us', 'Brooklyn'],
66
+ ['Brooklyn, NY', 'us', 'Brooklyn'],
67
+ ]
68
+
69
+ txt_io.each_with_index do |txt,i|
70
+ puts "testing [#{i}] #{txt[0]}"
71
+ assert_equal txt[2], TextUtils.find_city_in_addr_with_postal_code( txt[0], txt[1] )
72
+ end
73
+ end # method test_addr_with_postal_code
74
+
75
+
76
+ def test_addr
77
+
78
+ txt_io = [
79
+ ['London //', nil, 'London'],
80
+ ['// London', nil, 'London'],
81
+ ['// London ', nil, 'London'],
82
+ [' // London', nil, 'London'],
83
+ ['2320 Schwechat // Mautner Markhof-Straße 11', 'at', 'Schwechat'],
84
+ ['Mautner Markhof-Straße 11 // 2320 Schwechat', 'at', 'Schwechat'],
85
+ ['3910 Zwettl // Syrnauer Straße 22-25', 'at', 'Zwettl'],
86
+ ['Syrnauer Straße 22-25 // 3910 Zwettl', 'at', 'Zwettl'],
87
+ ['2018 Antwerpen', 'be', 'Antwerpen'],
88
+ ['2870 Breendonk-Puurs', 'be', 'Breendonk-Puurs'],
89
+ ['Alte Plauener Straße 24 // 95028 Hof', 'de', 'Hof'],
90
+ ['95028 Hof // Alte Plauener Straße 24', 'de', 'Hof'],
91
+ ['284 15 Kutná Hora', 'cz', 'Kutná Hora'],
92
+ ['288 25 Nymburk', 'cz', 'Nymburk'],
93
+ ['036 42 Martin', 'sk', 'Martin'],
94
+ ['974 05 Banská Bystrica', 'sk', 'Banská Bystrica'],
95
+ ['Brooklyn | NY 11249', 'us', 'Brooklyn'],
96
+ ['Brooklyn, NY 11249', 'us', 'Brooklyn'],
97
+ ['Brooklyn | NY', 'us', 'Brooklyn'],
98
+ ['Brooklyn, NY', 'us', 'Brooklyn'],
99
+ ]
100
+
101
+ txt_io.each_with_index do |txt,i|
102
+ puts "testing [#{i}] #{txt[0]}"
103
+ assert_equal txt[2], TextUtils.find_city_in_addr( txt[0], txt[1] )
104
+ end
105
+ end # method test_addr
106
+
107
+
108
+ end # class TestAddressHelper
@@ -34,7 +34,9 @@ class TestTitleHelper < MiniTest::Unit::TestCase
34
34
  [ '+Lupulus', 'lupulus' ],
35
35
  [ '+Malta', 'malta' ],
36
36
  [ 'Minerva 8:60', 'minerva860' ],
37
- [ 'Hop Crisis!', 'hopcrisis' ]
37
+ [ 'Hop Crisis!', 'hopcrisis' ],
38
+ [ '$Alianz$ Arena', 'alianzarena' ],
39
+ [ 'Arena Amazônia', 'arenaamazonia' ]
38
40
  ]
39
41
 
40
42
  txt_io.each do |txt|
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.8.3
4
+ version: 0.8.4
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2014-01-23 00:00:00.000000000 Z
12
+ date: 2014-02-12 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: logutils
16
- requirement: &21298812 !ruby/object:Gem::Requirement
16
+ requirement: &20379624 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,10 +21,10 @@ dependencies:
21
21
  version: '0.5'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *21298812
24
+ version_requirements: *20379624
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rdoc
27
- requirement: &21298416 !ruby/object:Gem::Requirement
27
+ requirement: &20377728 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '4.0'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *21298416
35
+ version_requirements: *20377728
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: hoe
38
- requirement: &21297996 !ruby/object:Gem::Requirement
38
+ requirement: &20376972 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '3.7'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *21297996
46
+ version_requirements: *20376972
47
47
  description: textutils - Text Filters, Helpers, Readers and More
48
48
  email: ruby-talk@ruby-lang.org
49
49
  executables: []
@@ -81,6 +81,7 @@ files:
81
81
  - lib/textutils/utils.rb
82
82
  - lib/textutils/version.rb
83
83
  - test/helper.rb
84
+ - test/test_address_helper.rb
84
85
  - test/test_hypertext_helper.rb
85
86
  - test/test_title_helper.rb
86
87
  - test/test_unicode_helper.rb
@@ -114,6 +115,7 @@ signing_key:
114
115
  specification_version: 3
115
116
  summary: textutils - Text Filters, Helpers, Readers and More
116
117
  test_files:
118
+ - test/test_address_helper.rb
117
119
  - test/test_hypertext_helper.rb
118
120
  - test/test_title_helper.rb
119
121
  - test/test_unicode_helper.rb