textutils 0.5.1 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Rakefile +1 -1
- data/lib/textutils/reader/values_reader.rb +33 -5
- data/lib/textutils/utils.rb +50 -7
- data/lib/textutils/version.rb +1 -2
- metadata +10 -10
data/Rakefile
CHANGED
@@ -130,31 +130,59 @@ class ValuesReader
|
|
130
130
|
## todo: add some more
|
131
131
|
## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
|
132
132
|
|
133
|
+
## todo: add unicode codepoint name
|
134
|
+
|
133
135
|
alternatives = [
|
134
136
|
['ß', 'ss'],
|
135
137
|
['æ', 'ae'],
|
136
|
-
['ä', 'ae'],
|
138
|
+
['ä', 'ae'],
|
139
|
+
['ā', 'a' ], # e.g. Liepājas
|
137
140
|
['á', 'a' ], # e.g. Bogotá, Králové
|
138
141
|
['ã', 'a' ], # e.g. São Paulo
|
139
142
|
['ă', 'a' ], # e.g. Chișinău
|
143
|
+
['â', 'a' ], # e.g. Goiânia
|
144
|
+
['å', 'a' ], # e.g. Vålerenga
|
145
|
+
['ą', 'a' ], # e.g. Śląsk
|
146
|
+
['ç', 'c' ], # e.g. São Gonçalo, Iguaçu, Neftçi
|
147
|
+
['ć', 'c' ], # e.g. Budućnost
|
140
148
|
['é', 'e' ], # e.g. Vélez, Králové
|
141
149
|
['è', 'e' ], # e.g. Rivières
|
142
150
|
['ê', 'e' ], # e.g. Grêmio
|
143
151
|
['ě', 'e' ], # e.g. Budějovice
|
152
|
+
['ė', 'e' ], # e.g. Vėtra
|
153
|
+
['ë', 'e' ], # e.g. Skënderbeu
|
154
|
+
['ğ', 'g' ], # e.g. Qarabağ
|
144
155
|
['ì', 'i' ], # e.g. Potosì
|
145
156
|
['í', 'i' ], # e.g. Ústí
|
157
|
+
['ł', 'l' ], # e.g. Wisła, Wrocław
|
146
158
|
['ñ', 'n' ], # e.g. Porteño
|
147
159
|
['ň', 'n' ], # e.g. Plzeň, Třeboň
|
148
|
-
['ö', 'oe'],
|
160
|
+
['ö', 'oe'],
|
161
|
+
['ő', 'o' ], # e.g. Győri
|
149
162
|
['ó', 'o' ], # e.g. Colón, Łódź, Kraków
|
163
|
+
['õ', 'o' ], # e.g. Nõmme
|
164
|
+
['ø', 'o' ], # e.g. Fuglafjørdur, København
|
150
165
|
['ř', 'r' ], # e.g. Třeboň
|
151
|
-
['ș', 's' ], # e.g. Chișinău
|
152
|
-
['
|
166
|
+
['ș', 's' ], # e.g. Chișinău, București
|
167
|
+
['ş', 's' ], # e.g. Beşiktaş
|
168
|
+
['š', 's' ], # e.g. Košice
|
169
|
+
['ü', 'ue'],
|
153
170
|
['ú', 'u' ], # e.g. Fútbol
|
171
|
+
['ū', 'u' ], # e.g. Sūduva
|
172
|
+
['ı', 'u' ], # e.g. Bakı # use u?? (Baku) why-why not?
|
154
173
|
['ź', 'z' ], # e.g. Łódź
|
174
|
+
['ž', 'z' ], # e.g. Domžale, Petržalka
|
175
|
+
|
155
176
|
['Č', 'c' ], # e.g. České
|
177
|
+
['İ', 'i' ], # e.g. İnter
|
178
|
+
['Í', 'i' ], # e.g. ÍBV
|
156
179
|
['Ł', 'l' ], # e.g. Łódź
|
157
|
-
['
|
180
|
+
['Ö', 'oe' ], # e.g. Örebro
|
181
|
+
['Ś', 's' ], # e.g. Śląsk
|
182
|
+
['Š', 's' ], # e.g. MŠK
|
183
|
+
['Ş', 's' ], # e.g. Şüvälan
|
184
|
+
['Ú', 'u' ], # e.g. Ústí, Újpest
|
185
|
+
['Ž', 'z' ] # e.g. Žilina
|
158
186
|
]
|
159
187
|
|
160
188
|
alternatives.each do |alt|
|
data/lib/textutils/utils.rb
CHANGED
@@ -6,18 +6,61 @@ class File
|
|
6
6
|
text = open( path, 'r:bom|utf-8' ) do |file|
|
7
7
|
file.read
|
8
8
|
end
|
9
|
-
|
10
|
-
|
11
|
-
text
|
12
|
-
|
13
|
-
### exit 1 #### do NOT tolerate!! cleanup dashes for now
|
14
|
-
'-'
|
15
|
-
end
|
9
|
+
|
10
|
+
# NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
|
11
|
+
text = convert_unicode_dashes_to_plain_ascii( text, path: path )
|
12
|
+
|
16
13
|
text
|
17
14
|
end
|
18
15
|
end # class File
|
19
16
|
|
17
|
+
|
18
|
+
# NB:
|
19
|
+
# U_HYPHEN_MINUS is standard ascii hyphen/minus e.g. -
|
20
|
+
#
|
21
|
+
# see en.wikipedia.org/wiki/Dash
|
22
|
+
|
23
|
+
U_HYPHEN = "\u2010" # unambiguous hyphen
|
24
|
+
U_NON_BREAKING_HYPHEN = "\u2011" # unambiguous non-breaking hyphen
|
25
|
+
U_MINUS = "\u2212" # unambiguous minus sign (html => −)
|
26
|
+
U_NDASH = "\u2013" # ndash (html => – ascii => --)
|
27
|
+
U_MDASH = "\u2014" # mdash (html => — ascii => ---)
|
28
|
+
|
29
|
+
|
30
|
+
def convert_unicode_dashes_to_plain_ascii( text, opts = {} )
|
31
|
+
|
32
|
+
text = text.gsub( /(#{U_HYPHEN}|#{U_NON_BREAKING_HYPHEN}|#{U_MINUS}|#{U_NDASH}|#{U_MDASH})/ ) do |_|
|
33
|
+
|
34
|
+
# puts "found U+#{'%04X' % $1.ord} (#{$1})"
|
35
|
+
|
36
|
+
msg = ''
|
37
|
+
|
38
|
+
if $1 == U_HYPHEN
|
39
|
+
msg << "found hyhpen U+2010 (#{$1})"
|
40
|
+
elsif $1 == U_NON_BREAKING_HYPHEN
|
41
|
+
msg << "found non_breaking_hyhpen U+2011 (#{$1})"
|
42
|
+
elsif $1 == U_MINUS
|
43
|
+
msg << "found minus U+2212 (#{$1})"
|
44
|
+
elsif $1 == U_NDASH
|
45
|
+
msg << "found ndash U+2013 (#{$1})"
|
46
|
+
elsif $1 == U_MDASH
|
47
|
+
msg << "found mdash U+2014 (#{$1})"
|
48
|
+
else
|
49
|
+
msg << "found unknown unicode dash U+#{'%04X' % $1.ord} (#{$1})"
|
50
|
+
end
|
51
|
+
|
52
|
+
msg << " in file >#{opts[:path]}<" if opts[:path]
|
53
|
+
msg << "; converting to plain ascii hyphen_minus (-)"
|
20
54
|
|
55
|
+
puts "*** warning: #{msg}"
|
56
|
+
|
57
|
+
'-'
|
58
|
+
end
|
59
|
+
|
60
|
+
text
|
61
|
+
end # method convert_unicode_dashes_to_plain_ascii
|
62
|
+
|
63
|
+
|
21
64
|
############
|
22
65
|
### fix/todo: share helper for all text readers/parsers- where to put it?
|
23
66
|
###
|
data/lib/textutils/version.rb
CHANGED
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: textutils
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,22 +9,22 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2013-
|
12
|
+
date: 2013-03-16 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: logutils
|
16
|
-
requirement: &
|
16
|
+
requirement: &84567730 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
|
-
- -
|
19
|
+
- - ~>
|
20
20
|
- !ruby/object:Gem::Version
|
21
|
-
version: 0.5
|
21
|
+
version: '0.5'
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *84567730
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rdoc
|
27
|
-
requirement: &
|
27
|
+
requirement: &84567510 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ~>
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: '3.10'
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *84567510
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: hoe
|
38
|
-
requirement: &
|
38
|
+
requirement: &84567260 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ~>
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '3.3'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *84567260
|
47
47
|
description: textutils - Text Filters, Helpers, Readers and More
|
48
48
|
email: webslideshow@googlegroups.com
|
49
49
|
executables: []
|