textutils 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ Hoe.spec 'textutils' do
18
18
  self.history_file = 'History.markdown'
19
19
 
20
20
  self.extra_deps = [
21
- ['logutils', '>= 0.5.0']
21
+ ['logutils', '~> 0.5'] # e.g. >= 0.5, < 1.0
22
22
  ]
23
23
 
24
24
  self.licenses = ['Public Domain']
@@ -130,31 +130,59 @@ class ValuesReader
130
130
  ## todo: add some more
131
131
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
132
132
 
133
+ ## todo: add unicode codepoint name
134
+
133
135
  alternatives = [
134
136
  ['ß', 'ss'],
135
137
  ['æ', 'ae'],
136
- ['ä', 'ae'],
138
+ ['ä', 'ae'],
139
+ ['ā', 'a' ], # e.g. Liepājas
137
140
  ['á', 'a' ], # e.g. Bogotá, Králové
138
141
  ['ã', 'a' ], # e.g São Paulo
139
142
  ['ă', 'a' ], # e.g. Chișinău
143
+ ['â', 'a' ], # e.g Goiânia
144
+ ['å', 'a' ], # e.g. Vålerenga
145
+ ['ą', 'a' ], # e.g. Śląsk
146
+ ['ç', 'c' ], # e.g. São Gonçalo, Iguaçu, Neftçi
147
+ ['ć', 'c' ], # e.g. Budućnost
140
148
  ['é', 'e' ], # e.g. Vélez, Králové
141
149
  ['è', 'e' ], # e.g. Rivières
142
150
  ['ê', 'e' ], # e.g. Grêmio
143
151
  ['ě', 'e' ], # e.g. Budějovice
152
+ ['ė', 'e' ], # e.g. Vėtra
153
+ ['ë', 'e' ], # e.g. Skënderbeu
154
+ ['ğ', 'g' ], # e.g. Qarabağ
144
155
  ['ì', 'i' ], # e.g. Potosì
145
156
  ['í', 'i' ], # e.g. Ústí
157
+ ['ł', 'l' ], # e.g. Wisła, Wrocław
146
158
  ['ñ', 'n' ], # e.g. Porteño
147
159
  ['ň', 'n' ], # e.g. Plzeň, Třeboň
148
- ['ö', 'oe'],
160
+ ['ö', 'oe'],
161
+ ['ő', 'o' ], # e.g. Győri
149
162
  ['ó', 'o' ], # e.g. Colón, Łódź, Kraków
163
+ ['õ', 'o' ], # e.g. Nõmme
164
+ ['ø', 'o' ], # e.g. Fuglafjørdur, København
150
165
  ['ř', 'r' ], # e.g. Třeboň
151
- ['ș', 's' ], # e.g. Chișinău
152
- ['ü', 'ue'],
166
+ ['ș', 's' ], # e.g. Chișinău, București
167
+ ['ş', 's' ], # e.g. Beşiktaş
168
+ ['š', 's' ], # e.g. Košice
169
+ ['ü', 'ue'],
153
170
  ['ú', 'u' ], # e.g. Fútbol
171
+ ['ū', 'u' ], # e.g. Sūduva
172
+ ['ı', 'u' ], # e.g. Bakı # use u?? (Baku) why-why not?
154
173
  ['ź', 'z' ], # e.g. Łódź
174
+ ['ž', 'z' ], # e.g. Domžale, Petržalka
175
+
155
176
  ['Č', 'c' ], # e.g. České
177
+ ['İ', 'i' ], # e.g. İnter
178
+ ['Í', 'i' ], # e.g. ÍBV
156
179
  ['Ł', 'l' ], # e.g. Łódź
157
- ['Ú', 'u' ], # e.g. Ústí
180
+ ['Ö', 'oe' ], # e.g. Örebro
181
+ ['Ś', 's' ], # e.g. Śląsk
182
+ ['Š', 's' ], # e.g. MŠK
183
+ ['Ş', 's' ], # e.g. Şüvälan
184
+ ['Ú', 'u' ], # e.g. Ústí, Újpest
185
+ ['Ž', 'z' ] # e.g. Žilina
158
186
  ]
159
187
 
160
188
  alternatives.each do |alt|
@@ -6,18 +6,61 @@ class File
6
6
  text = open( path, 'r:bom|utf-8' ) do |file|
7
7
  file.read
8
8
  end
9
- ### for convenience for now convert fancy dash to "plain" dash
10
- ## todo: check char codes for "fancy" dash alternatives
11
- text.gsub!( /—|–/ ) do |_|
12
- puts "*** warning: convert fancy dash to 'plain' dash in file >#{path}<"
13
- ### exit 1 #### do NOT tolerate!! cleanup dashes for now
14
- '-'
15
- end
9
+
10
+ # NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
11
+ text = convert_unicode_dashes_to_plain_ascii( text, path: path )
12
+
16
13
  text
17
14
  end
18
15
  end # class File
19
16
 
17
+
18
+ # NB:
19
+ # U_HYPHEN_MINUS is standard ascii hyphen/minus e.g. -
20
+ #
21
+ # see en.wikipedia.org/wiki/Dash
22
+
23
U_HYPHEN              = "\u2010".freeze   # unambiguous hyphen
U_NON_BREAKING_HYPHEN = "\u2011".freeze   # unambiguous non-breaking hyphen
U_MINUS               = "\u2212".freeze   # unambiguous minus sign (html => &minus;)
U_NDASH               = "\u2013".freeze   # ndash (html => &ndash; ascii => --)
U_MDASH               = "\u2014".freeze   # mdash (html => &mdash; ascii => ---)

## Label (name plus code point) reported in the warning for every dash
## character that gets converted. This table is the single source of
## truth: the matching regex below is derived from its keys, so adding a
## new dash here is all that is needed to convert it too.
UNICODE_DASH_LABELS = {
  U_HYPHEN              => 'hyphen U+2010',
  U_NON_BREAKING_HYPHEN => 'non_breaking_hyphen U+2011',
  U_MINUS               => 'minus U+2212',
  U_NDASH               => 'ndash U+2013',
  U_MDASH               => 'mdash U+2014'
}.freeze

## Matches any single character listed in UNICODE_DASH_LABELS
## (built once instead of on every call).
UNICODE_DASHES_REGEX = Regexp.union( UNICODE_DASH_LABELS.keys )


##
# Replaces "fancy" unicode dashes/hyphens (see UNICODE_DASH_LABELS)
# with the plain ascii hyphen-minus (-).
#
# A warning line is printed to stdout (via puts) for every character
# that gets replaced.
#
# text - the String to convert; it is NOT modified in place
# opts - optional Hash; if opts[:path] is set, the path is included
#        in the warning so the offending file can be located
#
# Returns a new String with all known unicode dashes replaced by '-'.

def convert_unicode_dashes_to_plain_ascii( text, opts = {} )
  # the location part of the message is the same for every match
  location = opts[:path] ? " in file >#{opts[:path]}<" : ''

  text.gsub( UNICODE_DASHES_REGEX ) do |dash|
    label = UNICODE_DASH_LABELS[ dash ]

    puts "*** warning: found #{label} (#{dash})#{location}; converting to plain ascii hyphen_minus (-)"

    '-'
  end
end # method convert_unicode_dashes_to_plain_ascii
62
+
63
+
21
64
  ############
22
65
  ### fix/todo: share helper for all text readers/parsers- where to put it?
23
66
  ###
@@ -1,7 +1,6 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.5.1'
4
+ VERSION = '0.5.2'
5
5
 
6
6
  end # module TextUtils
7
-
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,22 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-27 00:00:00.000000000 Z
12
+ date: 2013-03-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: logutils
16
- requirement: &75610580 !ruby/object:Gem::Requirement
16
+ requirement: &84567730 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - ! '>='
19
+ - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: 0.5.0
21
+ version: '0.5'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *75610580
24
+ version_requirements: *84567730
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rdoc
27
- requirement: &75610310 !ruby/object:Gem::Requirement
27
+ requirement: &84567510 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '3.10'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *75610310
35
+ version_requirements: *84567510
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: hoe
38
- requirement: &75610040 !ruby/object:Gem::Requirement
38
+ requirement: &84567260 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '3.3'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *75610040
46
+ version_requirements: *84567260
47
47
  description: textutils - Text Filters, Helpers, Readers and More
48
48
  email: webslideshow@googlegroups.com
49
49
  executables: []