textutils 0.5.9 → 0.5.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gemtest +0 -0
- data/Manifest.txt +5 -0
- data/lib/textutils/filter/comment_filter.rb +1 -1
- data/lib/textutils/helper/title_helper.rb +155 -0
- data/lib/textutils/helper/unicode_helper.rb +53 -0
- data/lib/textutils/reader/code_reader.rb +2 -0
- data/lib/textutils/reader/fixture_reader.rb +48 -0
- data/lib/textutils/reader/hash_reader.rb +2 -0
- data/lib/textutils/reader/line_reader.rb +3 -0
- data/lib/textutils/reader/values_reader.rb +4 -96
- data/lib/textutils/utils.rb +14 -96
- data/lib/textutils/version.rb +1 -1
- data/lib/textutils.rb +4 -1
- data/test/helper.rb +14 -0
- data/test/test_helper.rb +36 -0
- metadata +16 -9
data/.gemtest
ADDED
File without changes
|
data/Manifest.txt
CHANGED
@@ -7,9 +7,14 @@ lib/textutils/filter/code_filter.rb
|
|
7
7
|
lib/textutils/filter/comment_filter.rb
|
8
8
|
lib/textutils/filter/erb_django_filter.rb
|
9
9
|
lib/textutils/filter/erb_filter.rb
|
10
|
+
lib/textutils/helper/title_helper.rb
|
11
|
+
lib/textutils/helper/unicode_helper.rb
|
10
12
|
lib/textutils/reader/code_reader.rb
|
13
|
+
lib/textutils/reader/fixture_reader.rb
|
11
14
|
lib/textutils/reader/hash_reader.rb
|
12
15
|
lib/textutils/reader/line_reader.rb
|
13
16
|
lib/textutils/reader/values_reader.rb
|
14
17
|
lib/textutils/utils.rb
|
15
18
|
lib/textutils/version.rb
|
19
|
+
test/helper.rb
|
20
|
+
test/test_helper.rb
|
@@ -0,0 +1,155 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module TextUtils
  ##
  # Helpers for deriving lookup keys and regex source fragments from
  # human-readable titles (e.g. city, club or beer names).
  module TitleHelper

    ## accented char => ascii look-alike (used by title_to_key)
    ##
    ## todo: add some more
    ##  see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
    ##
    ## todo: add unicode codepoint name
    TITLE_KEY_ALTERNATIVES = {
      'ß' => 'ss',
      'æ' => 'ae',
      'ä' => 'ae',
      'ā' => 'a',   # e.g. Liepājas
      'á' => 'a',   # e.g. Bogotá, Králové
      'ã' => 'a',   # e.g. São Paulo
      'ă' => 'a',   # e.g. Chișinău
      'â' => 'a',   # e.g. Goiânia
      'å' => 'a',   # e.g. Vålerenga
      'ą' => 'a',   # e.g. Śląsk
      'ç' => 'c',   # e.g. São Gonçalo, Iguaçu, Neftçi
      'ć' => 'c',   # e.g. Budućnost
      'č' => 'c',   # e.g. Tradiční, Výčepní
      'é' => 'e',   # e.g. Vélez, Králové
      'è' => 'e',   # e.g. Rivières
      'ê' => 'e',   # e.g. Grêmio
      'ě' => 'e',   # e.g. Budějovice
      'ĕ' => 'e',   # e.g. Svĕtlý
      'ė' => 'e',   # e.g. Vėtra
      'ë' => 'e',   # e.g. Skënderbeu
      'ğ' => 'g',   # e.g. Qarabağ
      'ì' => 'i',   # e.g. Potosì
      'í' => 'i',   # e.g. Ústí
      'ł' => 'l',   # e.g. Wisła, Wrocław
      'ñ' => 'n',   # e.g. Porteño
      'ň' => 'n',   # e.g. Plzeň, Třeboň
      'ö' => 'oe',
      'ő' => 'o',   # e.g. Győri
      'ó' => 'o',   # e.g. Colón, Łódź, Kraków
      'õ' => 'o',   # e.g. Nõmme
      'ø' => 'o',   # e.g. Fuglafjørdur, København
      'ř' => 'r',   # e.g. Třeboň
      'ș' => 's',   # e.g. Chișinău, București
      'ş' => 's',   # e.g. Beşiktaş
      'š' => 's',   # e.g. Košice
      'ť' => 't',   # e.g. Měšťan
      'ü' => 'ue',
      'ú' => 'u',   # e.g. Fútbol
      'ū' => 'u',   # e.g. Sūduva
      'ů' => 'u',   # e.g. Sládkův
      'ı' => 'u',   # e.g. Bakı  # use u?? (Baku) why-why not?
      'ý' => 'y',   # e.g. Nefitrovaný
      'ź' => 'z',   # e.g. Łódź
      'ž' => 'z',   # e.g. Domžale, Petržalka

      'Č' => 'c',   # e.g. České
      'İ' => 'i',   # e.g. İnter
      'Í' => 'i',   # e.g. ÍBV
      'Ł' => 'l',   # e.g. Łódź
      'Ö' => 'oe',  # e.g. Örebro
      'Ř' => 'r',   # e.g. Řezák
      'Ś' => 's',   # e.g. Śląsk
      'Š' => 's',   # e.g. MŠK
      'Ş' => 's',   # e.g. Şüvälan
      'Ú' => 'u',   # e.g. Ústí, Újpest
      'Ž' => 'z'    # e.g. Žilina
    }.freeze

    ## matches any single char listed in TITLE_KEY_ALTERNATIVES
    TITLE_KEY_ALTERNATIVES_REGEX = Regexp.union( TITLE_KEY_ALTERNATIVES.keys )

    ## char => regex alternation matching the char with or without accent
    ##  (used by title_esc_regex); also makes - optional e.g. Blau-Weiss == Blau Weiss
    ##
    ## todo: add some more
    ##  see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
    ##
    ##  reuse for all readers!
    TITLE_REGEX_ALTERNATIVES = {
      '-' => '(-| )',    ## e.g. Blau-Weiß Linz
      'æ' => '(æ|ae)',   ## e.g.
      'á' => '(á|a)',    ## e.g. Bogotá, Sársfield
      'ã' => '(ã|a)',    ## e.g. São Paulo
      'ä' => '(ä|ae)',   ## e.g.
      'ç' => '(ç|c)',    ## e.g. Fenerbahçe
      'é' => '(é|e)',    ## e.g. Vélez
      'ê' => '(ê|e)',    ## e.g. Grêmio
      'ñ' => '(ñ|n)',    ## e.g. Porteño
      'ň' => '(ň|n)',    ## e.g. Plzeň
      'Ö' => '(Ö|Oe)',   ## e.g. Österreich
      'ö' => '(ö|oe)',   ## e.g. Mönchengladbach
      'ó' => '(ó|o)',    ## e.g. Colón
      'ș' => '(ș|s)',    ## e.g. Bucarești
      'ß' => '(ß|ss)',   ## e.g. Blau-Weiß Linz
      'ü' => '(ü|ue)',   ## e.g.
      'ú' => '(ú|u)'     ## e.g. Fútbol
    }.freeze

    ## matches any single char listed in TITLE_REGEX_ALTERNATIVES
    TITLE_REGEX_ALTERNATIVES_REGEX = Regexp.union( TITLE_REGEX_ALTERNATIVES.keys )

    ##
    # Builds a lower-case, ascii-ish key from a title,
    #  e.g. 'São Paulo' => 'saopaulo', 'Wien [Vienna]' => 'wien'.
    #
    # title - the title string
    #
    # Returns a new string; the argument is not modified.
    def title_to_key( title )

      ## NB: used in/moved from readers/values_reader.rb

      ## NB: transliterate BEFORE downcasing.
      ##  - on ruby < 2.4 downcase only folds ascii letters, thus, upper-case
      ##    accented chars are listed in the alternatives table
      ##  - on ruby >= 2.4 downcase folds unicode too e.g. Ś => ś (not listed)
      ##    or İ => i + combining dot above; replacing first keeps the key
      ##    the same on all ruby versions
      key = title.gsub( TITLE_KEY_ALTERNATIVES_REGEX, TITLE_KEY_ALTERNATIVES )

      key = key.downcase

      ### remove optional english translation in square brackets ([]) e.g. Wien [Vienna]
      key = key.gsub( /\[.+\]/, '' )

      ## remove optional longer title part in () e.g. Las Palmas (de Gran Canaria), Palma (de Mallorca)
      key = key.gsub( /\(.+\)/, '' )

      ## remove optional longer title part in {} e.g. Ottakringer {Bio} or {Alkoholfrei}
      ## todo: use for autotags? e.g. {Bio} => bio
      key = key.gsub( /\{.+\}/, '' )

      ## remove all whitespace and punctuation
      key = key.gsub( /[ \t_\-\.()\[\]'"\/]/, '' )

      ## remove special chars (e.g. %°&)
      key = key.gsub( /[%&°]/, '' )

      ## second pass: turn accented chars that only show up after a
      ##  unicode-aware downcase (e.g. Á => á) into ascii look-alikes
      key.gsub( TITLE_KEY_ALTERNATIVES_REGEX, TITLE_KEY_ALTERNATIVES )
    end # method title_to_key


    ##
    # Turns a plain title into regex source text that matches the title
    #  literally, with or without accents and with - or space,
    #  e.g. 'Blau-Weiß Linz' => 'Blau(-| )Wei(ß|ss) Linz'.
    #
    # title_unescaped - the plain title string
    #
    # Returns a new string (regex source, NOT a Regexp object).
    def title_esc_regex( title_unescaped )

      ## escape regex special chars e.g. . to \. and ( to \( etc.
      # e.g. Benfica Lis.
      # e.g. Club Atlético Colón (Santa Fe)

      ## NB: cannot use Regexp.escape! will escape space ' ' to '\ '
      ##   (and - to \- breaking the (-| ) alternative below)
      ## title = Regexp.escape( title_unescaped )
      ##
      ## NB: use block form - avoids backslash handling of replacement strings
      title = title_unescaped.gsub( /[\\.()\[\]{}*+?|^$]/ ) { |ch| "\\#{ch}" }

      ## match accented char with or without accents
      ##  add (ü|ue) etc.
      ## also make - optional change to (-| ) e.g. Blau-Weiss == Blau Weiss
      title.gsub( TITLE_REGEX_ALTERNATIVES_REGEX, TITLE_REGEX_ALTERNATIVES )
    end # method title_esc_regex

  end # module TitleHelper
end # module TextUtils
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
|
4
|
+
module TextUtils
  ##
  # Helpers for normalizing fancy unicode dashes/hyphens to the plain ascii
  # hyphen-minus (-).
  module UnicodeHelper

    # NB:
    #  U_HYPHEN_MINUS is standard ascii hyphen/minus e.g. -
    #
    #  see en.wikipedia.org/wiki/Dash

    U_HYPHEN              = "\u2010"  # unambiguous hyphen
    U_NON_BREAKING_HYPHEN = "\u2011"  # unambiguous non-breaking hyphen
    U_MINUS               = "\u2212"  # unambiguous minus sign (html => &minus;)
    U_NDASH               = "\u2013"  # ndash (html => &ndash; ascii => --)
    U_MDASH               = "\u2014"  # mdash (html => &mdash; ascii => ---)

    # dash char => name used in the warning message
    UNICODE_DASH_NAMES = {
      U_HYPHEN              => 'hyphen U+2010',
      U_NON_BREAKING_HYPHEN => 'non_breaking_hyphen U+2011',
      U_MINUS               => 'minus U+2212',
      U_NDASH               => 'ndash U+2013',
      U_MDASH               => 'mdash U+2014'
    }.freeze

    # matches any one of the dash chars above (built once, not on every call)
    UNICODE_DASH_REGEX = Regexp.union( UNICODE_DASH_NAMES.keys )

    ##
    # Replaces every unicode hyphen, non-breaking hyphen, minus, ndash and
    # mdash in text with a plain ascii hyphen-minus (-) and prints one
    # warning line to stdout per replaced char.
    #
    # text - the string to convert
    # opts - optional hash; opts[:path] (if present) gets included in
    #        the warning message
    #
    # Returns a new string; the argument is not modified.
    def convert_unicode_dashes_to_plain_ascii( text, opts = {} )

      text.gsub( UNICODE_DASH_REGEX ) do |dash|

        # NB: the regex only matches chars listed in UNICODE_DASH_NAMES,
        #   thus, the name lookup always succeeds
        where = opts[:path] ? " in file >#{opts[:path]}<" : ''

        puts "*** warning: found #{UNICODE_DASH_NAMES[dash]} (#{dash})#{where}; converting to plain ascii hyphen_minus (-)"

        '-'
      end
    end # method convert_unicode_dashes_to_plain_ascii

  end # module UnicodeHelper
end # module TextUtils
|
@@ -0,0 +1,48 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
### read in a list of fixtures (that is, fixture names/files)
|
4
|
+
|
5
|
+
# fix: move into TextUtils namespace/module!!
|
6
|
+
|
7
|
+
##
# Reads a list of fixtures (that is, fixture names/files) from a yaml file
# and yields them one by one via each.
#
# The yaml top level is iterated as key/value pairs; a string value adds a
# single fixture name, an array value adds all its entries (in order).
# Values of any other type get logged as an error and skipped.
class FixtureReader

  include LogUtils::Logging

  # path - path to the yaml fixture setup file
  def initialize( path )
    @path = path

    ## nb: assume/enforce utf-8 encoding (with or without BOM - byte order mark)
    ##      - see textutils/utils.rb
    text = File.read_utf8( @path )

    ## NOTE(review): YAML.load may build arbitrary objects with some yaml
    ##   library versions; ok for trusted setup files - consider
    ##   YAML.safe_load if the path can come from an untrusted source
    ##
    ## nb: an empty file loads as a falsy value (no hash); treat as no fixtures
    hash = YAML.load( text ) || {}

    ### build up array for fixtures from hash

    @ary = []

    hash.each do |key_wild, value_wild|
      key = key_wild.to_s.strip

      logger.debug "yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value_wild}<<"

      case value_wild
      when String   # assume single fixture name
        @ary << value_wild
      when Array    # assume array of fixture names as strings
        ## nb: was ary + value_wild (undefined local ary => NameError)
        @ary += value_wild
      else
        logger.error "unknown fixture type in setup (yaml key:#{key_wild.class.name} >>#{key}<<, value:#{value_wild.class.name} >>#{value_wild}<<); skipping"
      end
    end

    logger.debug "fixture setup:"
    logger.debug @ary.to_json
  end

  # Yields every fixture name in the order collected from the setup file.
  def each
    @ary.each do |fixture|
      yield( fixture )
    end
  end # method each

end # class FixtureReader
|
@@ -1,5 +1,7 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
|
+
# fix: move into TextUtils namespace/module!!
|
4
|
+
|
3
5
|
class ValuesReader
|
4
6
|
|
5
7
|
include LogUtils::Logging
|
@@ -150,7 +152,7 @@ class ValuesReader
|
|
150
152
|
|
151
153
|
if key_col == '<auto>'
|
152
154
|
## autogenerate key from first title
|
153
|
-
key_col = title_to_key( titles[0] )
|
155
|
+
key_col = TextUtils.title_to_key( titles[0] )
|
154
156
|
logger.debug " autogen key >#{key_col}< from title >#{titles[0]}<, textutils version #{TextUtils::VERSION}"
|
155
157
|
end
|
156
158
|
|
@@ -273,7 +275,7 @@ class ValuesReader
|
|
273
275
|
|
274
276
|
if key_col == '<auto>'
|
275
277
|
## autogenerate key from first title
|
276
|
-
key_col = title_to_key( titles[0] )
|
278
|
+
key_col = TextUtils.title_to_key( titles[0] )
|
277
279
|
logger.debug " autogen key >#{key_col}< from title >#{titles[0]}<, textutils version #{TextUtils::VERSION}"
|
278
280
|
end
|
279
281
|
|
@@ -286,101 +288,7 @@ class ValuesReader
|
|
286
288
|
end # each lines
|
287
289
|
|
288
290
|
end # method each_line
|
289
|
-
|
290
|
-
|
291
291
|
|
292
|
-
def title_to_key( title )
|
293
|
-
|
294
|
-
## NB: downcase does NOT work for accented chars (thus, include in alternatives)
|
295
|
-
key = title.downcase
|
296
|
-
|
297
|
-
### remove optional english translation in square brackets ([]) e.g. Wien [Vienna]
|
298
|
-
key = key.gsub( /\[.+\]/, '' )
|
299
|
-
|
300
|
-
## remove optional longer title part in () e.g. Las Palmas (de Gran Canaria), Palma (de Mallorca)
|
301
|
-
key = key.gsub( /\(.+\)/, '' )
|
302
|
-
|
303
|
-
## remove optional longer title part in {} e.g. Ottakringer {Bio} or {Alkoholfrei}
|
304
|
-
## todo: use for autotags? e.g. {Bio} => bio
|
305
|
-
key = key.gsub( /\{.+\}/, '' )
|
306
|
-
|
307
|
-
## remove all whitespace and punctuation
|
308
|
-
key = key.gsub( /[ \t_\-\.()\[\]'"\/]/, '' )
|
309
|
-
|
310
|
-
## remove special chars (e.g. %°&)
|
311
|
-
key = key.gsub( /[%&°]/, '' )
|
312
|
-
|
313
|
-
## turn accented char into ascii look alike if possible
|
314
|
-
##
|
315
|
-
## todo: add some more
|
316
|
-
## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
|
317
|
-
|
318
|
-
## todo: add unicode codepoint name
|
319
|
-
|
320
|
-
alternatives = [
|
321
|
-
['ß', 'ss'],
|
322
|
-
['æ', 'ae'],
|
323
|
-
['ä', 'ae'],
|
324
|
-
['ā', 'a' ], # e.g. Liepājas
|
325
|
-
['á', 'a' ], # e.g. Bogotá, Králové
|
326
|
-
['ã', 'a' ], # e.g São Paulo
|
327
|
-
['ă', 'a' ], # e.g. Chișinău
|
328
|
-
['â', 'a' ], # e.g Goiânia
|
329
|
-
['å', 'a' ], # e.g. Vålerenga
|
330
|
-
['ą', 'a' ], # e.g. Śląsk
|
331
|
-
['ç', 'c' ], # e.g. São Gonçalo, Iguaçu, Neftçi
|
332
|
-
['ć', 'c' ], # e.g. Budućnost
|
333
|
-
['č', 'c' ], # e.g. Tradiční, Výčepní
|
334
|
-
['é', 'e' ], # e.g. Vélez, Králové
|
335
|
-
['è', 'e' ], # e.g. Rivières
|
336
|
-
['ê', 'e' ], # e.g. Grêmio
|
337
|
-
['ě', 'e' ], # e.g. Budějovice
|
338
|
-
['ĕ', 'e' ], # e.g. Svĕtlý
|
339
|
-
['ė', 'e' ], # e.g. Vėtra
|
340
|
-
['ë', 'e' ], # e.g. Skënderbeu
|
341
|
-
['ğ', 'g' ], # e.g. Qarabağ
|
342
|
-
['ì', 'i' ], # e.g. Potosì
|
343
|
-
['í', 'i' ], # e.g. Ústí
|
344
|
-
['ł', 'l' ], # e.g. Wisła, Wrocław
|
345
|
-
['ñ', 'n' ], # e.g. Porteño
|
346
|
-
['ň', 'n' ], # e.g. Plzeň, Třeboň
|
347
|
-
['ö', 'oe'],
|
348
|
-
['ő', 'o' ], # e.g. Győri
|
349
|
-
['ó', 'o' ], # e.g. Colón, Łódź, Kraków
|
350
|
-
['õ', 'o' ], # e.g. Nõmme
|
351
|
-
['ø', 'o' ], # e.g. Fuglafjørdur, København
|
352
|
-
['ř', 'r' ], # e.g. Třeboň
|
353
|
-
['ș', 's' ], # e.g. Chișinău, București
|
354
|
-
['ş', 's' ], # e.g. Beşiktaş
|
355
|
-
['š', 's' ], # e.g. Košice
|
356
|
-
['ť', 't' ], # e.g. Měšťan
|
357
|
-
['ü', 'ue'],
|
358
|
-
['ú', 'u' ], # e.g. Fútbol
|
359
|
-
['ū', 'u' ], # e.g. Sūduva
|
360
|
-
['ů', 'u' ], # e.g. Sládkův
|
361
|
-
['ı', 'u' ], # e.g. Bakı # use u?? (Baku) why-why not?
|
362
|
-
['ý', 'y' ], # e.g. Nefitrovaný
|
363
|
-
['ź', 'z' ], # e.g. Łódź
|
364
|
-
['ž', 'z' ], # e.g. Domžale, Petržalka
|
365
|
-
|
366
|
-
['Č', 'c' ], # e.g. České
|
367
|
-
['İ', 'i' ], # e.g. İnter
|
368
|
-
['Í', 'i' ], # e.g. ÍBV
|
369
|
-
['Ł', 'l' ], # e.g. Łódź
|
370
|
-
['Ö', 'oe' ], # e.g. Örebro
|
371
|
-
['Ř', 'r' ], # e.g. Řezák
|
372
|
-
['Ś', 's' ], # e.g. Śląsk
|
373
|
-
['Š', 's' ], # e.g. MŠK
|
374
|
-
['Ş', 's' ], # e.g. Şüvälan
|
375
|
-
['Ú', 'u' ], # e.g. Ústí, Újpest
|
376
|
-
['Ž', 'z' ] # e.g. Žilina
|
377
|
-
]
|
378
|
-
|
379
|
-
alternatives.each do |alt|
|
380
|
-
key = key.gsub( alt[0], alt[1] )
|
381
|
-
end
|
382
292
|
|
383
|
-
key
|
384
|
-
end # method title_to_key
|
385
293
|
|
386
294
|
end # class ValuesReader
|
data/lib/textutils/utils.rb
CHANGED
@@ -1,6 +1,15 @@
|
|
1
1
|
# encoding: utf-8
|
2
2
|
|
3
3
|
|
4
|
+
|
5
|
+
module TextUtils
|
6
|
+
# make helpers available as class methods e.g. TextUtils.convert_unicode_dashes_to_plain_ascii
|
7
|
+
extend UnicodeHelper
|
8
|
+
extend TitleHelper
|
9
|
+
end
|
10
|
+
|
11
|
+
|
12
|
+
|
4
13
|
class File
|
5
14
|
def self.read_utf8( path )
|
6
15
|
text = open( path, 'r:bom|utf-8' ) do |file|
|
@@ -8,107 +17,16 @@ class File
|
|
8
17
|
end
|
9
18
|
|
10
19
|
# NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
|
11
|
-
text = convert_unicode_dashes_to_plain_ascii( text, path: path )
|
20
|
+
text = TextUtils.convert_unicode_dashes_to_plain_ascii( text, path: path )
|
12
21
|
|
13
22
|
text
|
14
23
|
end
|
15
24
|
end # class File
|
16
25
|
|
17
26
|
|
18
|
-
# NB:
|
19
|
-
# U_HYPHEN_MINUS is standard ascii hyphen/minus e.g. -
|
20
|
-
#
|
21
|
-
# see en.wikipedia.org/wiki/Dash
|
22
|
-
|
23
|
-
U_HYPHEN = "\u2010" # unambigous hyphen
|
24
|
-
U_NON_BREAKING_HYPHEN = "\u2011" # unambigous non-breaking hyphen
|
25
|
-
U_MINUS = "\u2212" # unambigous minus sign (html => −)
|
26
|
-
U_NDASH = "\u2013" # ndash (html => – ascii => --)
|
27
|
-
U_MDASH = "\u2014" # mdash (html => — ascii => ---)
|
28
|
-
|
29
|
-
|
30
|
-
def convert_unicode_dashes_to_plain_ascii( text, opts = {} )
|
31
|
-
|
32
|
-
text = text.gsub( /(#{U_HYPHEN}|#{U_NON_BREAKING_HYPHEN}|#{U_MINUS}|#{U_NDASH}|#{U_MDASH})/ ) do |_|
|
33
|
-
|
34
|
-
# puts "found U+#{'%04X' % $1.ord} (#{$1})"
|
35
|
-
|
36
|
-
msg = ''
|
37
|
-
|
38
|
-
if $1 == U_HYPHEN
|
39
|
-
msg << "found hyhpen U+2010 (#{$1})"
|
40
|
-
elsif $1 == U_NON_BREAKING_HYPHEN
|
41
|
-
msg << "found non_breaking_hyhpen U+2011 (#{$1})"
|
42
|
-
elsif $1 == U_MINUS
|
43
|
-
msg << "found minus U+2212 (#{$1})"
|
44
|
-
elsif $1 == U_NDASH
|
45
|
-
msg << "found ndash U+2013 (#{$1})"
|
46
|
-
elsif $1 == U_MDASH
|
47
|
-
msg << "found mdash U+2014 (#{$1})"
|
48
|
-
else
|
49
|
-
msg << "found unknown unicode dash U+#{'%04X' % $1.ord} (#{$1})"
|
50
|
-
end
|
51
27
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
28
|
+
##
# Deprecated top-level shim kept for old callers: prints a deprecation
# warning to stdout on every call and forwards to TextUtils.title_esc_regex.
#
# title_unescaped - the plain title string (passed through unchanged)
#
# Returns whatever TextUtils.title_esc_regex returns.
def title_esc_regex( title_unescaped )
  ## nb: the helper module is named TitleHelper (singular)
  puts "*** warn: deprecated fn call: use TextUtils.title_esc_regex() or include TextUtils::TitleHelper"
  TextUtils.title_esc_regex( title_unescaped )
end
|
56
32
|
|
57
|
-
'-'
|
58
|
-
end
|
59
|
-
|
60
|
-
text
|
61
|
-
end # method convert_unicode_dashes_to_plain_ascii
|
62
|
-
|
63
|
-
|
64
|
-
############
|
65
|
-
### fix/todo: share helper for all text readers/parsers- where to put it?
|
66
|
-
###
|
67
|
-
|
68
|
-
def title_esc_regex( title_unescaped )
|
69
|
-
|
70
|
-
## escape regex special chars e.g. . to \. and ( to \( etc.
|
71
|
-
# e.g. Benfica Lis.
|
72
|
-
# e.g. Club Atlético Colón (Santa Fe)
|
73
|
-
|
74
|
-
## NB: cannot use Regexp.escape! will escape space '' to '\ '
|
75
|
-
## title = Regexp.escape( title_unescaped )
|
76
|
-
title = title_unescaped.gsub( '.', '\.' )
|
77
|
-
title = title.gsub( '(', '\(' )
|
78
|
-
title = title.gsub( ')', '\)' )
|
79
|
-
|
80
|
-
## match accented char with or without accents
|
81
|
-
## add (ü|ue) etc.
|
82
|
-
## also make - optional change to (-| ) e.g. Blau-Weiss == Blau Weiss
|
83
|
-
|
84
|
-
## todo: add some more
|
85
|
-
## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
|
86
|
-
##
|
87
|
-
## reuse for all readers!
|
88
|
-
|
89
|
-
alternatives = [
|
90
|
-
['-', '(-| )'], ## e.g. Blau-Weiß Linz
|
91
|
-
['æ', '(æ|ae)'], ## e.g.
|
92
|
-
['á', '(á|a)'], ## e.g. Bogotá, Sársfield
|
93
|
-
['ã', '(ã|a)'], ## e.g São Paulo
|
94
|
-
['ä', '(ä|ae)'], ## e.g.
|
95
|
-
['ç', '(ç|c)'], ## e.g. Fenerbahçe
|
96
|
-
['é', '(é|e)'], ## e.g. Vélez
|
97
|
-
['ê', '(ê|e)'], ## e.g. Grêmio
|
98
|
-
['ñ', '(ñ|n)'], ## e.g. Porteño
|
99
|
-
['ň', '(ň|n)'], ## e.g. Plzeň
|
100
|
-
['Ö', '(Ö|Oe)'], ## e.g. Österreich
|
101
|
-
['ö', '(ö|oe)'], ## e.g. Mönchengladbach
|
102
|
-
['ó', '(ó|o)'], ## e.g. Colón
|
103
|
-
['ș', '(ș|s)'], ## e.g. Bucarești
|
104
|
-
['ß', '(ß|ss)'], ## e.g. Blau-Weiß Linz
|
105
|
-
['ü', '(ü|ue)'], ## e.g.
|
106
|
-
['ú', '(ú|u)'] ## e.g. Fútbol
|
107
|
-
]
|
108
|
-
|
109
|
-
alternatives.each do |alt|
|
110
|
-
title = title.gsub( alt[0], alt[1] )
|
111
|
-
end
|
112
|
-
|
113
|
-
title
|
114
|
-
end
|
data/lib/textutils/version.rb
CHANGED
data/lib/textutils.rb
CHANGED
@@ -21,10 +21,13 @@ require 'textutils/filter/comment_filter'
|
|
21
21
|
require 'textutils/filter/erb_django_filter'
|
22
22
|
require 'textutils/filter/erb_filter'
|
23
23
|
|
24
|
+
require 'textutils/helper/unicode_helper'
|
25
|
+
require 'textutils/helper/title_helper'
|
26
|
+
|
24
27
|
require 'textutils/utils'
|
25
28
|
require 'textutils/reader/code_reader'
|
26
29
|
require 'textutils/reader/hash_reader'
|
27
30
|
require 'textutils/reader/line_reader'
|
28
31
|
require 'textutils/reader/values_reader'
|
29
|
-
|
32
|
+
require 'textutils/reader/fixture_reader'
|
30
33
|
|
data/test/helper.rb
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
|
2
|
+
## $:.unshift(File.dirname(__FILE__))
|
3
|
+
|
4
|
+
## minitest setup
|
5
|
+
|
6
|
+
# require 'minitest/unit'
|
7
|
+
require 'minitest/autorun'
|
8
|
+
|
9
|
+
# include MiniTest::Unit # lets us use TestCase instead of MiniTest::Unit::TestCase
|
10
|
+
|
11
|
+
|
12
|
+
## our own code
|
13
|
+
|
14
|
+
require 'textutils'
|
data/test/test_helper.rb
ADDED
@@ -0,0 +1,36 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
|
3
|
+
###
|
4
|
+
# to run use
|
5
|
+
# ruby -I ./lib -I ./test test/test_helper.rb
|
6
|
+
# or better
|
7
|
+
# rake test
|
8
|
+
|
9
|
+
require 'helper'
|
10
|
+
|
11
|
+
##
# Tests for the TextUtils helper methods
#  (convert_unicode_dashes_to_plain_ascii and title_to_key).
#
# NOTE(review): MiniTest::Unit::TestCase is the minitest 4.x-style base
#   class - confirm before running with a newer minitest.
class TestHelper < MiniTest::Unit::TestCase

  def test_convert_unicode_dashes

    txt_in  = "\u2010 \u2011 \u2212 \u2013 \u2014"  # NB: unicode chars require double quoted strings
    txt_out = '- - - - -'

    ## nb: assert_equal (not assert( a == b )) reports expected and actual on failure
    assert_equal( txt_out, TextUtils.convert_unicode_dashes_to_plain_ascii( txt_in ) )
  end


  def test_title_to_key

    ## pairs of [title, expected key]
    txt_io = [
      [ 'São Paulo',   'saopaulo' ],
      [ 'São Gonçalo', 'saogoncalo' ],
      [ 'Výčepní',     'vycepni' ]
    ]

    txt_io.each do |title, expected_key|
      assert_equal( expected_key, TextUtils.title_to_key( title ) )
    end
  end

end # class TestHelper
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.
|
4
|
+
version: 0.5.10
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-05-
|
12
|
+
date: 2013-05-08 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &72786300 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0.5'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *72786300
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rdoc
|
27
|
-
requirement: &
|
27
|
+
requirement: &72786080 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '3.10'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *72786080
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: hoe
|
38
|
-
requirement: &
|
38
|
+
requirement: &72785860 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '3.3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *72785860
|
47
47
|
description: textutils - Text Filters, Helpers, Readers and More
|
48
48
|
email: webslideshow@googlegroups.com
|
49
49
|
executables: []
|
@@ -60,12 +60,18 @@ files:
|
|
60
60
|
- lib/textutils/filter/comment_filter.rb
|
61
61
|
- lib/textutils/filter/erb_django_filter.rb
|
62
62
|
- lib/textutils/filter/erb_filter.rb
|
63
|
+
- lib/textutils/helper/title_helper.rb
|
64
|
+
- lib/textutils/helper/unicode_helper.rb
|
63
65
|
- lib/textutils/reader/code_reader.rb
|
66
|
+
- lib/textutils/reader/fixture_reader.rb
|
64
67
|
- lib/textutils/reader/hash_reader.rb
|
65
68
|
- lib/textutils/reader/line_reader.rb
|
66
69
|
- lib/textutils/reader/values_reader.rb
|
67
70
|
- lib/textutils/utils.rb
|
68
71
|
- lib/textutils/version.rb
|
72
|
+
- test/helper.rb
|
73
|
+
- test/test_helper.rb
|
74
|
+
- .gemtest
|
69
75
|
homepage: http://geraldb.github.com/textutils
|
70
76
|
licenses:
|
71
77
|
- Public Domain
|
@@ -93,4 +99,5 @@ rubygems_version: 1.8.17
|
|
93
99
|
signing_key:
|
94
100
|
specification_version: 3
|
95
101
|
summary: textutils - Text Filters, Helpers, Readers and More
|
96
|
-
test_files:
|
102
|
+
test_files:
|
103
|
+
- test/test_helper.rb
|