textutils 0.5.9 → 0.5.10
Sign up to get free protection for your applications and to get access to all the features.
- data/.gemtest +0 -0
- data/Manifest.txt +5 -0
- data/lib/textutils/filter/comment_filter.rb +1 -1
- data/lib/textutils/helper/title_helper.rb +155 -0
- data/lib/textutils/helper/unicode_helper.rb +53 -0
- data/lib/textutils/reader/code_reader.rb +2 -0
- data/lib/textutils/reader/fixture_reader.rb +48 -0
- data/lib/textutils/reader/hash_reader.rb +2 -0
- data/lib/textutils/reader/line_reader.rb +3 -0
- data/lib/textutils/reader/values_reader.rb +4 -96
- data/lib/textutils/utils.rb +14 -96
- data/lib/textutils/version.rb +1 -1
- data/lib/textutils.rb +4 -1
- data/test/helper.rb +14 -0
- data/test/test_helper.rb +36 -0
- metadata +16 -9
data/.gemtest
ADDED
File without changes
|
data/Manifest.txt
CHANGED
@@ -7,9 +7,14 @@ lib/textutils/filter/code_filter.rb
|
|
7
7
|
lib/textutils/filter/comment_filter.rb
|
8
8
|
lib/textutils/filter/erb_django_filter.rb
|
9
9
|
lib/textutils/filter/erb_filter.rb
|
10
|
+
lib/textutils/helper/title_helper.rb
|
11
|
+
lib/textutils/helper/unicode_helper.rb
|
10
12
|
lib/textutils/reader/code_reader.rb
|
13
|
+
lib/textutils/reader/fixture_reader.rb
|
11
14
|
lib/textutils/reader/hash_reader.rb
|
12
15
|
lib/textutils/reader/line_reader.rb
|
13
16
|
lib/textutils/reader/values_reader.rb
|
14
17
|
lib/textutils/utils.rb
|
15
18
|
lib/textutils/version.rb
|
19
|
+
test/helper.rb
|
20
|
+
test/test_helper.rb
|
@@ -0,0 +1,155 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module TextUtils
|
5
|
+
module TitleHelper
|
6
|
+
|
7
|
+
def title_to_key( title )
|
8
|
+
|
9
|
+
## NB: used in/moved from readers/values_reader.rb
|
10
|
+
|
11
|
+
|
12
|
+
## NB: downcase does NOT work for accented chars (thus, include in alternatives)
|
13
|
+
key = title.downcase
|
14
|
+
|
15
|
+
### remove optional english translation in square brackets ([]) e.g. Wien [Vienna]
|
16
|
+
key = key.gsub( /\[.+\]/, '' )
|
17
|
+
|
18
|
+
## remove optional longer title part in () e.g. Las Palmas (de Gran Canaria), Palma (de Mallorca)
|
19
|
+
key = key.gsub( /\(.+\)/, '' )
|
20
|
+
|
21
|
+
## remove optional longer title part in {} e.g. Ottakringer {Bio} or {Alkoholfrei}
|
22
|
+
## todo: use for autotags? e.g. {Bio} => bio
|
23
|
+
key = key.gsub( /\{.+\}/, '' )
|
24
|
+
|
25
|
+
## remove all whitespace and punctuation
|
26
|
+
key = key.gsub( /[ \t_\-\.()\[\]'"\/]/, '' )
|
27
|
+
|
28
|
+
## remove special chars (e.g. %°&)
|
29
|
+
key = key.gsub( /[%&°]/, '' )
|
30
|
+
|
31
|
+
## turn accented char into ascii look alike if possible
|
32
|
+
##
|
33
|
+
## todo: add some more
|
34
|
+
## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
|
35
|
+
|
36
|
+
## todo: add unicode codepoint name
|
37
|
+
|
38
|
+
alternatives = [
|
39
|
+
['ß', 'ss'],
|
40
|
+
['æ', 'ae'],
|
41
|
+
['ä', 'ae'],
|
42
|
+
['ā', 'a' ], # e.g. Liepājas
|
43
|
+
['á', 'a' ], # e.g. Bogotá, Králové
|
44
|
+
['ã', 'a' ], # e.g São Paulo
|
45
|
+
['ă', 'a' ], # e.g. Chișinău
|
46
|
+
['â', 'a' ], # e.g Goiânia
|
47
|
+
['å', 'a' ], # e.g. Vålerenga
|
48
|
+
['ą', 'a' ], # e.g. Śląsk
|
49
|
+
['ç', 'c' ], # e.g. São Gonçalo, Iguaçu, Neftçi
|
50
|
+
['ć', 'c' ], # e.g. Budućnost
|
51
|
+
['č', 'c' ], # e.g. Tradiční, Výčepní
|
52
|
+
['é', 'e' ], # e.g. Vélez, Králové
|
53
|
+
['è', 'e' ], # e.g. Rivières
|
54
|
+
['ê', 'e' ], # e.g. Grêmio
|
55
|
+
['ě', 'e' ], # e.g. Budějovice
|
56
|
+
['ĕ', 'e' ], # e.g. Svĕtlý
|
57
|
+
['ė', 'e' ], # e.g. Vėtra
|
58
|
+
['ë', 'e' ], # e.g. Skënderbeu
|
59
|
+
['ğ', 'g' ], # e.g. Qarabağ
|
60
|
+
['ì', 'i' ], # e.g. Potosì
|
61
|
+
['í', 'i' ], # e.g. Ústí
|
62
|
+
['ł', 'l' ], # e.g. Wisła, Wrocław
|
63
|
+
['ñ', 'n' ], # e.g. Porteño
|
64
|
+
['ň', 'n' ], # e.g. Plzeň, Třeboň
|
65
|
+
['ö', 'oe'],
|
66
|
+
['ő', 'o' ], # e.g. Győri
|
67
|
+
['ó', 'o' ], # e.g. Colón, Łódź, Kraków
|
68
|
+
['õ', 'o' ], # e.g. Nõmme
|
69
|
+
['ø', 'o' ], # e.g. Fuglafjørdur, København
|
70
|
+
['ř', 'r' ], # e.g. Třeboň
|
71
|
+
['ș', 's' ], # e.g. Chișinău, București
|
72
|
+
['ş', 's' ], # e.g. Beşiktaş
|
73
|
+
['š', 's' ], # e.g. Košice
|
74
|
+
['ť', 't' ], # e.g. Měšťan
|
75
|
+
['ü', 'ue'],
|
76
|
+
['ú', 'u' ], # e.g. Fútbol
|
77
|
+
['ū', 'u' ], # e.g. Sūduva
|
78
|
+
['ů', 'u' ], # e.g. Sládkův
|
79
|
+
['ı', 'u' ], # e.g. Bakı # use u?? (Baku) why-why not?
|
80
|
+
['ý', 'y' ], # e.g. Nefitrovaný
|
81
|
+
['ź', 'z' ], # e.g. Łódź
|
82
|
+
['ž', 'z' ], # e.g. Domžale, Petržalka
|
83
|
+
|
84
|
+
['Č', 'c' ], # e.g. České
|
85
|
+
['İ', 'i' ], # e.g. İnter
|
86
|
+
['Í', 'i' ], # e.g. ÍBV
|
87
|
+
['Ł', 'l' ], # e.g. Łódź
|
88
|
+
['Ö', 'oe' ], # e.g. Örebro
|
89
|
+
['Ř', 'r' ], # e.g. Řezák
|
90
|
+
['Ś', 's' ], # e.g. Śląsk
|
91
|
+
['Š', 's' ], # e.g. MŠK
|
92
|
+
['Ş', 's' ], # e.g. Şüvälan
|
93
|
+
['Ú', 'u' ], # e.g. Ústí, Újpest
|
94
|
+
['Ž', 'z' ] # e.g. Žilina
|
95
|
+
]
|
96
|
+
|
97
|
+
alternatives.each do |alt|
|
98
|
+
key = key.gsub( alt[0], alt[1] )
|
99
|
+
end
|
100
|
+
|
101
|
+
key
|
102
|
+
end # method title_to_key
|
103
|
+
|
104
|
+
|
105
|
+
def title_esc_regex( title_unescaped )
|
106
|
+
|
107
|
+
## escape regex special chars e.g. . to \. and ( to \( etc.
|
108
|
+
# e.g. Benfica Lis.
|
109
|
+
# e.g. Club Atlético Colón (Santa Fe)
|
110
|
+
|
111
|
+
## NB: cannot use Regexp.escape! it will escape a space ' ' to '\ '
|
112
|
+
## title = Regexp.escape( title_unescaped )
|
113
|
+
title = title_unescaped.gsub( '.', '\.' )
|
114
|
+
title = title.gsub( '(', '\(' )
|
115
|
+
title = title.gsub( ')', '\)' )
|
116
|
+
|
117
|
+
## match accented char with or without accents
|
118
|
+
## add (ü|ue) etc.
|
119
|
+
## also make - optional change to (-| ) e.g. Blau-Weiss == Blau Weiss
|
120
|
+
|
121
|
+
## todo: add some more
|
122
|
+
## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
|
123
|
+
##
|
124
|
+
## reuse for all readers!
|
125
|
+
|
126
|
+
alternatives = [
|
127
|
+
['-', '(-| )'], ## e.g. Blau-Weiß Linz
|
128
|
+
['æ', '(æ|ae)'], ## e.g.
|
129
|
+
['á', '(á|a)'], ## e.g. Bogotá, Sársfield
|
130
|
+
['ã', '(ã|a)'], ## e.g São Paulo
|
131
|
+
['ä', '(ä|ae)'], ## e.g.
|
132
|
+
['ç', '(ç|c)'], ## e.g. Fenerbahçe
|
133
|
+
['é', '(é|e)'], ## e.g. Vélez
|
134
|
+
['ê', '(ê|e)'], ## e.g. Grêmio
|
135
|
+
['ñ', '(ñ|n)'], ## e.g. Porteño
|
136
|
+
['ň', '(ň|n)'], ## e.g. Plzeň
|
137
|
+
['Ö', '(Ö|Oe)'], ## e.g. Österreich
|
138
|
+
['ö', '(ö|oe)'], ## e.g. Mönchengladbach
|
139
|
+
['ó', '(ó|o)'], ## e.g. Colón
|
140
|
+
['ș', '(ș|s)'], ## e.g. Bucarești
|
141
|
+
['ß', '(ß|ss)'], ## e.g. Blau-Weiß Linz
|
142
|
+
['ü', '(ü|ue)'], ## e.g.
|
143
|
+
['ú', '(ú|u)'] ## e.g. Fútbol
|
144
|
+
]
|
145
|
+
|
146
|
+
alternatives.each do |alt|
|
147
|
+
title = title.gsub( alt[0], alt[1] )
|
148
|
+
end
|
149
|
+
|
150
|
+
title
|
151
|
+
end
|
152
|
+
|
153
|
+
|
154
|
+
end # module TitleHelper
|
155
|
+
end # module TextUtils
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module TextUtils
|
5
|
+
module UnicodeHelper
|
6
|
+
|
7
|
+
# NB:
|
8
|
+
# U_HYPHEN_MINUS is standard ascii hyphen/minus e.g. -
|
9
|
+
#
|
10
|
+
# see en.wikipedia.org/wiki/Dash
|
11
|
+
|
12
|
+
U_HYPHEN = "\u2010" # unambigous hyphen
|
13
|
+
U_NON_BREAKING_HYPHEN = "\u2011" # unambigous non-breaking hyphen
|
14
|
+
U_MINUS = "\u2212" # unambigous minus sign (html => −)
|
15
|
+
U_NDASH = "\u2013" # ndash (html => – ascii => --)
|
16
|
+
U_MDASH = "\u2014" # mdash (html => — ascii => ---)
|
17
|
+
|
18
|
+
def convert_unicode_dashes_to_plain_ascii( text, opts = {} )
|
19
|
+
|
20
|
+
text = text.gsub( /(#{U_HYPHEN}|#{U_NON_BREAKING_HYPHEN}|#{U_MINUS}|#{U_NDASH}|#{U_MDASH})/ ) do |_|
|
21
|
+
|
22
|
+
# puts "found U+#{'%04X' % $1.ord} (#{$1})"
|
23
|
+
|
24
|
+
msg = ''
|
25
|
+
|
26
|
+
if $1 == U_HYPHEN
|
27
|
+
msg << "found hyhpen U+2010 (#{$1})"
|
28
|
+
elsif $1 == U_NON_BREAKING_HYPHEN
|
29
|
+
msg << "found non_breaking_hyhpen U+2011 (#{$1})"
|
30
|
+
elsif $1 == U_MINUS
|
31
|
+
msg << "found minus U+2212 (#{$1})"
|
32
|
+
elsif $1 == U_NDASH
|
33
|
+
msg << "found ndash U+2013 (#{$1})"
|
34
|
+
elsif $1 == U_MDASH
|
35
|
+
msg << "found mdash U+2014 (#{$1})"
|
36
|
+
else
|
37
|
+
msg << "found unknown unicode dash U+#{'%04X' % $1.ord} (#{$1})"
|
38
|
+
end
|
39
|
+
|
40
|
+
msg << " in file >#{opts[:path]}<" if opts[:path]
|
41
|
+
msg << "; converting to plain ascii hyphen_minus (-)"
|
42
|
+
|
43
|
+
puts "*** warning: #{msg}"
|
44
|
+
|
45
|
+
'-'
|
46
|
+
end
|
47
|
+
|
48
|
+
text
|
49
|
+
end # method convert_unicode_dashes_to_plain_ascii
|
50
|
+
|
51
|
+
|
52
|
+
end # module UnicodeHelper
|
53
|
+
end # module TextUtils
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
### read in a list of fixtures (that is, fixture names/files)
|
4
|
+
|
5
|
+
# fix: move into TextUtils namespace/module!!
|
6
|
+
|
7
|
+
class FixtureReader
|
8
|
+
|
9
|
+
include LogUtils::Logging
|
10
|
+
|
11
|
+
def initialize( path )
|
12
|
+
@path = path
|
13
|
+
|
14
|
+
## nb: assume/enforce utf-8 encoding (with or without BOM - byte order mark)
|
15
|
+
## - see textutils/utils.rb
|
16
|
+
text = File.read_utf8( @path )
|
17
|
+
|
18
|
+
hash = YAML.load( text )
|
19
|
+
|
20
|
+
### build up array for fixtures from hash
|
21
|
+
|
22
|
+
@ary = []
|
23
|
+
|
24
|
+
hash.each do |key_wild, value_wild|
|
25
|
+
key = key_wild.to_s.strip
|
26
|
+
|
27
|
+
logger.debug "yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value_wild}<<"
|
28
|
+
|
29
|
+
if value_wild.kind_of?( String ) # assume single fixture name
|
30
|
+
@ary << value_wild
|
31
|
+
elsif value_wild.kind_of?( Array ) # assume array of fixture names as strings
|
32
|
+
@ary = ary + value_wild
|
33
|
+
else
|
34
|
+
logger.error "unknow fixture type in setup (yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value_wild}<<); skipping"
|
35
|
+
end
|
36
|
+
end
|
37
|
+
|
38
|
+
logger.debug "fixture setup:"
|
39
|
+
logger.debug @ary.to_json
|
40
|
+
end
|
41
|
+
|
42
|
+
def each
|
43
|
+
@ary.each do |fixture|
|
44
|
+
yield( fixture )
|
45
|
+
end
|
46
|
+
end # method each
|
47
|
+
|
48
|
+
end # class FixtureReader
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
+
# fix: move into TextUtils namespace/module!!
|
4
|
+
|
3
5
|
class ValuesReader
|
4
6
|
|
5
7
|
include LogUtils::Logging
|
@@ -150,7 +152,7 @@ class ValuesReader
|
|
150
152
|
|
151
153
|
if key_col == '<auto>'
|
152
154
|
## autogenerate key from first title
|
153
|
-
key_col = title_to_key( titles[0] )
|
155
|
+
key_col = TextUtils.title_to_key( titles[0] )
|
154
156
|
logger.debug " autogen key >#{key_col}< from title >#{titles[0]}<, textutils version #{TextUtils::VERSION}"
|
155
157
|
end
|
156
158
|
|
@@ -273,7 +275,7 @@ class ValuesReader
|
|
273
275
|
|
274
276
|
if key_col == '<auto>'
|
275
277
|
## autogenerate key from first title
|
276
|
-
key_col = title_to_key( titles[0] )
|
278
|
+
key_col = TextUtils.title_to_key( titles[0] )
|
277
279
|
logger.debug " autogen key >#{key_col}< from title >#{titles[0]}<, textutils version #{TextUtils::VERSION}"
|
278
280
|
end
|
279
281
|
|
@@ -286,101 +288,7 @@ class ValuesReader
|
|
286
288
|
end # each lines
|
287
289
|
|
288
290
|
end # method each_line
|
289
|
-
|
290
|
-
|
291
291
|
|
292
|
-
def title_to_key( title )
|
293
|
-
|
294
|
-
## NB: downcase does NOT work for accented chars (thus, include in alternatives)
|
295
|
-
key = title.downcase
|
296
|
-
|
297
|
-
### remove optional english translation in square brackets ([]) e.g. Wien [Vienna]
|
298
|
-
key = key.gsub( /\[.+\]/, '' )
|
299
|
-
|
300
|
-
## remove optional longer title part in () e.g. Las Palmas (de Gran Canaria), Palma (de Mallorca)
|
301
|
-
key = key.gsub( /\(.+\)/, '' )
|
302
|
-
|
303
|
-
## remove optional longer title part in {} e.g. Ottakringer {Bio} or {Alkoholfrei}
|
304
|
-
## todo: use for autotags? e.g. {Bio} => bio
|
305
|
-
key = key.gsub( /\{.+\}/, '' )
|
306
|
-
|
307
|
-
## remove all whitespace and punctuation
|
308
|
-
key = key.gsub( /[ \t_\-\.()\[\]'"\/]/, '' )
|
309
|
-
|
310
|
-
## remove special chars (e.g. %°&)
|
311
|
-
key = key.gsub( /[%&°]/, '' )
|
312
|
-
|
313
|
-
## turn accented char into ascii look alike if possible
|
314
|
-
##
|
315
|
-
## todo: add some more
|
316
|
-
## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
|
317
|
-
|
318
|
-
## todo: add unicode codepoint name
|
319
|
-
|
320
|
-
alternatives = [
|
321
|
-
['ß', 'ss'],
|
322
|
-
['æ', 'ae'],
|
323
|
-
['ä', 'ae'],
|
324
|
-
['ā', 'a' ], # e.g. Liepājas
|
325
|
-
['á', 'a' ], # e.g. Bogotá, Králové
|
326
|
-
['ã', 'a' ], # e.g São Paulo
|
327
|
-
['ă', 'a' ], # e.g. Chișinău
|
328
|
-
['â', 'a' ], # e.g Goiânia
|
329
|
-
['å', 'a' ], # e.g. Vålerenga
|
330
|
-
['ą', 'a' ], # e.g. Śląsk
|
331
|
-
['ç', 'c' ], # e.g. São Gonçalo, Iguaçu, Neftçi
|
332
|
-
['ć', 'c' ], # e.g. Budućnost
|
333
|
-
['č', 'c' ], # e.g. Tradiční, Výčepní
|
334
|
-
['é', 'e' ], # e.g. Vélez, Králové
|
335
|
-
['è', 'e' ], # e.g. Rivières
|
336
|
-
['ê', 'e' ], # e.g. Grêmio
|
337
|
-
['ě', 'e' ], # e.g. Budějovice
|
338
|
-
['ĕ', 'e' ], # e.g. Svĕtlý
|
339
|
-
['ė', 'e' ], # e.g. Vėtra
|
340
|
-
['ë', 'e' ], # e.g. Skënderbeu
|
341
|
-
['ğ', 'g' ], # e.g. Qarabağ
|
342
|
-
['ì', 'i' ], # e.g. Potosì
|
343
|
-
['í', 'i' ], # e.g. Ústí
|
344
|
-
['ł', 'l' ], # e.g. Wisła, Wrocław
|
345
|
-
['ñ', 'n' ], # e.g. Porteño
|
346
|
-
['ň', 'n' ], # e.g. Plzeň, Třeboň
|
347
|
-
['ö', 'oe'],
|
348
|
-
['ő', 'o' ], # e.g. Győri
|
349
|
-
['ó', 'o' ], # e.g. Colón, Łódź, Kraków
|
350
|
-
['õ', 'o' ], # e.g. Nõmme
|
351
|
-
['ø', 'o' ], # e.g. Fuglafjørdur, København
|
352
|
-
['ř', 'r' ], # e.g. Třeboň
|
353
|
-
['ș', 's' ], # e.g. Chișinău, București
|
354
|
-
['ş', 's' ], # e.g. Beşiktaş
|
355
|
-
['š', 's' ], # e.g. Košice
|
356
|
-
['ť', 't' ], # e.g. Měšťan
|
357
|
-
['ü', 'ue'],
|
358
|
-
['ú', 'u' ], # e.g. Fútbol
|
359
|
-
['ū', 'u' ], # e.g. Sūduva
|
360
|
-
['ů', 'u' ], # e.g. Sládkův
|
361
|
-
['ı', 'u' ], # e.g. Bakı # use u?? (Baku) why-why not?
|
362
|
-
['ý', 'y' ], # e.g. Nefitrovaný
|
363
|
-
['ź', 'z' ], # e.g. Łódź
|
364
|
-
['ž', 'z' ], # e.g. Domžale, Petržalka
|
365
|
-
|
366
|
-
['Č', 'c' ], # e.g. České
|
367
|
-
['İ', 'i' ], # e.g. İnter
|
368
|
-
['Í', 'i' ], # e.g. ÍBV
|
369
|
-
['Ł', 'l' ], # e.g. Łódź
|
370
|
-
['Ö', 'oe' ], # e.g. Örebro
|
371
|
-
['Ř', 'r' ], # e.g. Řezák
|
372
|
-
['Ś', 's' ], # e.g. Śląsk
|
373
|
-
['Š', 's' ], # e.g. MŠK
|
374
|
-
['Ş', 's' ], # e.g. Şüvälan
|
375
|
-
['Ú', 'u' ], # e.g. Ústí, Újpest
|
376
|
-
['Ž', 'z' ] # e.g. Žilina
|
377
|
-
]
|
378
|
-
|
379
|
-
alternatives.each do |alt|
|
380
|
-
key = key.gsub( alt[0], alt[1] )
|
381
|
-
end
|
382
292
|
|
383
|
-
key
|
384
|
-
end # method title_to_key
|
385
293
|
|
386
294
|
end # class ValuesReader
|
data/lib/textutils/utils.rb
CHANGED
@@ -1,6 +1,15 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
|
4
|
+
|
5
|
+
module TextUtils
|
6
|
+
# make helpers available as class methods e.g. TextUtils.convert_unicode_dashes_to_plain_ascii
|
7
|
+
extend UnicodeHelper
|
8
|
+
extend TitleHelper
|
9
|
+
end
|
10
|
+
|
11
|
+
|
12
|
+
|
4
13
|
class File
|
5
14
|
def self.read_utf8( path )
|
6
15
|
text = open( path, 'r:bom|utf-8' ) do |file|
|
@@ -8,107 +17,16 @@ class File
|
|
8
17
|
end
|
9
18
|
|
10
19
|
# NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
|
11
|
-
text = convert_unicode_dashes_to_plain_ascii( text, path: path )
|
20
|
+
text = TextUtils.convert_unicode_dashes_to_plain_ascii( text, path: path )
|
12
21
|
|
13
22
|
text
|
14
23
|
end
|
15
24
|
end # class File
|
16
25
|
|
17
26
|
|
18
|
-
# NB:
|
19
|
-
# U_HYPHEN_MINUS is standard ascii hyphen/minus e.g. -
|
20
|
-
#
|
21
|
-
# see en.wikipedia.org/wiki/Dash
|
22
|
-
|
23
|
-
U_HYPHEN = "\u2010" # unambigous hyphen
|
24
|
-
U_NON_BREAKING_HYPHEN = "\u2011" # unambigous non-breaking hyphen
|
25
|
-
U_MINUS = "\u2212" # unambigous minus sign (html => −)
|
26
|
-
U_NDASH = "\u2013" # ndash (html => – ascii => --)
|
27
|
-
U_MDASH = "\u2014" # mdash (html => — ascii => ---)
|
28
|
-
|
29
|
-
|
30
|
-
def convert_unicode_dashes_to_plain_ascii( text, opts = {} )
|
31
|
-
|
32
|
-
text = text.gsub( /(#{U_HYPHEN}|#{U_NON_BREAKING_HYPHEN}|#{U_MINUS}|#{U_NDASH}|#{U_MDASH})/ ) do |_|
|
33
|
-
|
34
|
-
# puts "found U+#{'%04X' % $1.ord} (#{$1})"
|
35
|
-
|
36
|
-
msg = ''
|
37
|
-
|
38
|
-
if $1 == U_HYPHEN
|
39
|
-
msg << "found hyhpen U+2010 (#{$1})"
|
40
|
-
elsif $1 == U_NON_BREAKING_HYPHEN
|
41
|
-
msg << "found non_breaking_hyhpen U+2011 (#{$1})"
|
42
|
-
elsif $1 == U_MINUS
|
43
|
-
msg << "found minus U+2212 (#{$1})"
|
44
|
-
elsif $1 == U_NDASH
|
45
|
-
msg << "found ndash U+2013 (#{$1})"
|
46
|
-
elsif $1 == U_MDASH
|
47
|
-
msg << "found mdash U+2014 (#{$1})"
|
48
|
-
else
|
49
|
-
msg << "found unknown unicode dash U+#{'%04X' % $1.ord} (#{$1})"
|
50
|
-
end
|
51
27
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
28
|
+
def title_esc_regex( title_unescaped )
|
29
|
+
puts "*** warn: depreceated fn call: use TextUtils.title_esc_regex() or include TextUtils::TitleHelpers"
|
30
|
+
TextUtils.title_esc_regex( title_unescaped )
|
31
|
+
end
|
56
32
|
|
57
|
-
'-'
|
58
|
-
end
|
59
|
-
|
60
|
-
text
|
61
|
-
end # method convert_unicode_dashes_to_plain_ascii
|
62
|
-
|
63
|
-
|
64
|
-
############
|
65
|
-
### fix/todo: share helper for all text readers/parsers- where to put it?
|
66
|
-
###
|
67
|
-
|
68
|
-
def title_esc_regex( title_unescaped )
|
69
|
-
|
70
|
-
## escape regex special chars e.g. . to \. and ( to \( etc.
|
71
|
-
# e.g. Benfica Lis.
|
72
|
-
# e.g. Club Atlético Colón (Santa Fe)
|
73
|
-
|
74
|
-
## NB: cannot use Regexp.escape! it will escape a space ' ' to '\ '
|
75
|
-
## title = Regexp.escape( title_unescaped )
|
76
|
-
title = title_unescaped.gsub( '.', '\.' )
|
77
|
-
title = title.gsub( '(', '\(' )
|
78
|
-
title = title.gsub( ')', '\)' )
|
79
|
-
|
80
|
-
## match accented char with or without accents
|
81
|
-
## add (ü|ue) etc.
|
82
|
-
## also make - optional change to (-| ) e.g. Blau-Weiss == Blau Weiss
|
83
|
-
|
84
|
-
## todo: add some more
|
85
|
-
## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
|
86
|
-
##
|
87
|
-
## reuse for all readers!
|
88
|
-
|
89
|
-
alternatives = [
|
90
|
-
['-', '(-| )'], ## e.g. Blau-Weiß Linz
|
91
|
-
['æ', '(æ|ae)'], ## e.g.
|
92
|
-
['á', '(á|a)'], ## e.g. Bogotá, Sársfield
|
93
|
-
['ã', '(ã|a)'], ## e.g São Paulo
|
94
|
-
['ä', '(ä|ae)'], ## e.g.
|
95
|
-
['ç', '(ç|c)'], ## e.g. Fenerbahçe
|
96
|
-
['é', '(é|e)'], ## e.g. Vélez
|
97
|
-
['ê', '(ê|e)'], ## e.g. Grêmio
|
98
|
-
['ñ', '(ñ|n)'], ## e.g. Porteño
|
99
|
-
['ň', '(ň|n)'], ## e.g. Plzeň
|
100
|
-
['Ö', '(Ö|Oe)'], ## e.g. Österreich
|
101
|
-
['ö', '(ö|oe)'], ## e.g. Mönchengladbach
|
102
|
-
['ó', '(ó|o)'], ## e.g. Colón
|
103
|
-
['ș', '(ș|s)'], ## e.g. Bucarești
|
104
|
-
['ß', '(ß|ss)'], ## e.g. Blau-Weiß Linz
|
105
|
-
['ü', '(ü|ue)'], ## e.g.
|
106
|
-
['ú', '(ú|u)'] ## e.g. Fútbol
|
107
|
-
]
|
108
|
-
|
109
|
-
alternatives.each do |alt|
|
110
|
-
title = title.gsub( alt[0], alt[1] )
|
111
|
-
end
|
112
|
-
|
113
|
-
title
|
114
|
-
end
|
data/lib/textutils/version.rb
CHANGED
data/lib/textutils.rb
CHANGED
@@ -21,10 +21,13 @@ require 'textutils/filter/comment_filter'
|
|
21
21
|
require 'textutils/filter/erb_django_filter'
|
22
22
|
require 'textutils/filter/erb_filter'
|
23
23
|
|
24
|
+
require 'textutils/helper/unicode_helper'
|
25
|
+
require 'textutils/helper/title_helper'
|
26
|
+
|
24
27
|
require 'textutils/utils'
|
25
28
|
require 'textutils/reader/code_reader'
|
26
29
|
require 'textutils/reader/hash_reader'
|
27
30
|
require 'textutils/reader/line_reader'
|
28
31
|
require 'textutils/reader/values_reader'
|
29
|
-
|
32
|
+
require 'textutils/reader/fixture_reader'
|
30
33
|
|
data/test/helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
|
2
|
+
## $:.unshift(File.dirname(__FILE__))
|
3
|
+
|
4
|
+
## minitest setup
|
5
|
+
|
6
|
+
# require 'minitest/unit'
|
7
|
+
require 'minitest/autorun'
|
8
|
+
|
9
|
+
# include MiniTest::Unit # lets us use TestCase instead of MiniTest::Unit::TestCase
|
10
|
+
|
11
|
+
|
12
|
+
## our own code
|
13
|
+
|
14
|
+
require 'textutils'
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_helper.rb
|
6
|
+
# or better
|
7
|
+
# rake test
|
8
|
+
|
9
|
+
require 'helper'
|
10
|
+
|
11
|
+
class TestHelper < MiniTest::Unit::TestCase
|
12
|
+
|
13
|
+
def test_convert_unicode_dashes
|
14
|
+
|
15
|
+
txt_in = "\u2010 \u2011 \u2212 \u2013 \u2014" # NB: unicode chars require double quoted strings
|
16
|
+
txt_out = '- - - - -'
|
17
|
+
|
18
|
+
assert( txt_out == TextUtils.convert_unicode_dashes_to_plain_ascii( txt_in ) )
|
19
|
+
end
|
20
|
+
|
21
|
+
|
22
|
+
def test_title_to_key
|
23
|
+
|
24
|
+
txt_io = [
|
25
|
+
[ 'São Paulo', 'saopaulo' ],
|
26
|
+
[ 'São Gonçalo', 'saogoncalo' ],
|
27
|
+
[ 'Výčepní', 'vycepni' ]
|
28
|
+
]
|
29
|
+
|
30
|
+
txt_io.each do |txt|
|
31
|
+
assert( txt[1] == TextUtils.title_to_key( txt[0] ) )
|
32
|
+
end
|
33
|
+
end
|
34
|
+
|
35
|
+
|
36
|
+
end # class TestHelper
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.9
|
4
|
+
version: 0.5.10
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-05-
|
12
|
+
date: 2013-05-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &72786300 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0.5'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *72786300
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rdoc
|
27
|
-
requirement: &
|
27
|
+
requirement: &72786080 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '3.10'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *72786080
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: hoe
|
38
|
-
requirement: &
|
38
|
+
requirement: &72785860 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '3.3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *72785860
|
47
47
|
description: textutils - Text Filters, Helpers, Readers and More
|
48
48
|
email: webslideshow@googlegroups.com
|
49
49
|
executables: []
|
@@ -60,12 +60,18 @@ files:
|
|
60
60
|
- lib/textutils/filter/comment_filter.rb
|
61
61
|
- lib/textutils/filter/erb_django_filter.rb
|
62
62
|
- lib/textutils/filter/erb_filter.rb
|
63
|
+
- lib/textutils/helper/title_helper.rb
|
64
|
+
- lib/textutils/helper/unicode_helper.rb
|
63
65
|
- lib/textutils/reader/code_reader.rb
|
66
|
+
- lib/textutils/reader/fixture_reader.rb
|
64
67
|
- lib/textutils/reader/hash_reader.rb
|
65
68
|
- lib/textutils/reader/line_reader.rb
|
66
69
|
- lib/textutils/reader/values_reader.rb
|
67
70
|
- lib/textutils/utils.rb
|
68
71
|
- lib/textutils/version.rb
|
72
|
+
- test/helper.rb
|
73
|
+
- test/test_helper.rb
|
74
|
+
- .gemtest
|
69
75
|
homepage: http://geraldb.github.com/textutils
|
70
76
|
licenses:
|
71
77
|
- Public Domain
|
@@ -93,4 +99,5 @@ rubygems_version: 1.8.17
|
|
93
99
|
signing_key:
|
94
100
|
specification_version: 3
|
95
101
|
summary: textutils - Text Filters, Helpers, Readers and More
|
96
|
-
test_files:
|
102
|
+
test_files:
|
103
|
+
- test/test_helper.rb
|