textutils 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/Rakefile CHANGED
@@ -18,7 +18,7 @@ Hoe.spec 'textutils' do
18
18
  self.history_file = 'History.markdown'
19
19
 
20
20
  self.extra_deps = [
21
- ['logutils', '>= 0.5.0']
21
+ ['logutils', '~> 0.5'] # e.g. >= 0.5, < 1.0
22
22
  ]
23
23
 
24
24
  self.licenses = ['Public Domain']
@@ -130,31 +130,59 @@ class ValuesReader
130
130
  ## todo: add some more
131
131
  ## see http://en.wikipedia.org/wiki/List_of_XML_and_HTML_character_entity_references for more
132
132
 
133
+ ## todo: add unicode codepoint name
134
+
133
135
  alternatives = [
134
136
  ['ß', 'ss'],
135
137
  ['æ', 'ae'],
136
- ['ä', 'ae'],
138
+ ['ä', 'ae'],
139
+ ['ā', 'a' ], # e.g. Liepājas
137
140
  ['á', 'a' ], # e.g. Bogotá, Králové
138
141
  ['ã', 'a' ], # e.g. São Paulo
139
142
  ['ă', 'a' ], # e.g. Chișinău
143
+ ['â', 'a' ], # e.g. Goiânia
144
+ ['å', 'a' ], # e.g. Vålerenga
145
+ ['ą', 'a' ], # e.g. Śląsk
146
+ ['ç', 'c' ], # e.g. São Gonçalo, Iguaçu, Neftçi
147
+ ['ć', 'c' ], # e.g. Budućnost
140
148
  ['é', 'e' ], # e.g. Vélez, Králové
141
149
  ['è', 'e' ], # e.g. Rivières
142
150
  ['ê', 'e' ], # e.g. Grêmio
143
151
  ['ě', 'e' ], # e.g. Budějovice
152
+ ['ė', 'e' ], # e.g. Vėtra
153
+ ['ë', 'e' ], # e.g. Skënderbeu
154
+ ['ğ', 'g' ], # e.g. Qarabağ
144
155
  ['ì', 'i' ], # e.g. Potosì
145
156
  ['í', 'i' ], # e.g. Ústí
157
+ ['ł', 'l' ], # e.g. Wisła, Wrocław
146
158
  ['ñ', 'n' ], # e.g. Porteño
147
159
  ['ň', 'n' ], # e.g. Plzeň, Třeboň
148
- ['ö', 'oe'],
160
+ ['ö', 'oe'],
161
+ ['ő', 'o' ], # e.g. Győri
149
162
  ['ó', 'o' ], # e.g. Colón, Łódź, Kraków
163
+ ['õ', 'o' ], # e.g. Nõmme
164
+ ['ø', 'o' ], # e.g. Fuglafjørdur, København
150
165
  ['ř', 'r' ], # e.g. Třeboň
151
- ['ș', 's' ], # e.g. Chișinău
152
- ['ü', 'ue'],
166
+ ['ș', 's' ], # e.g. Chișinău, București
167
+ ['ş', 's' ], # e.g. Beşiktaş
168
+ ['š', 's' ], # e.g. Košice
169
+ ['ü', 'ue'],
153
170
  ['ú', 'u' ], # e.g. Fútbol
171
+ ['ū', 'u' ], # e.g. Sūduva
172
+ ['ı', 'u' ], # e.g. Bakı # use u?? (Baku) why-why not?
154
173
  ['ź', 'z' ], # e.g. Łódź
174
+ ['ž', 'z' ], # e.g. Domžale, Petržalka
175
+
155
176
  ['Č', 'c' ], # e.g. České
177
+ ['İ', 'i' ], # e.g. İnter
178
+ ['Í', 'i' ], # e.g. ÍBV
156
179
  ['Ł', 'l' ], # e.g. Łódź
157
- ['Ú', 'u' ], # e.g. Ústí
180
+ ['Ö', 'oe' ], # e.g. Örebro
181
+ ['Ś', 's' ], # e.g. Śląsk
182
+ ['Š', 's' ], # e.g. MŠK
183
+ ['Ş', 's' ], # e.g. Şüvälan
184
+ ['Ú', 'u' ], # e.g. Ústí, Újpest
185
+ ['Ž', 'z' ] # e.g. Žilina
158
186
  ]
159
187
 
160
188
  alternatives.each do |alt|
@@ -6,18 +6,61 @@ class File
6
6
  text = open( path, 'r:bom|utf-8' ) do |file|
7
7
  file.read
8
8
  end
9
- ### for convenience for now convert fancy dash to "plain" dash
10
- ## todo: check char codes for "fancy" dash alternatives
11
- text.gsub!( /—|–/ ) do |_|
12
- puts "*** warning: convert fancy dash to 'plain' dash in file >#{path}<"
13
- ### exit 1 #### do NOT tolerate!! cleanup dashes for now
14
- '-'
15
- end
9
+
10
+ # NB: for convenience: convert fancy unicode dashes/hyphens to plain ascii hyphen-minus
11
+ text = convert_unicode_dashes_to_plain_ascii( text, path: path )
12
+
16
13
  text
17
14
  end
18
15
  end # class File
19
16
 
17
+
18
+ # NB:
19
+ # U_HYPHEN_MINUS is standard ascii hyphen/minus e.g. -
20
+ #
21
+ # see en.wikipedia.org/wiki/Dash
22
+
23
+ U_HYPHEN = "\u2010" # unambiguous hyphen
24
+ U_NON_BREAKING_HYPHEN = "\u2011" # unambiguous non-breaking hyphen
25
+ U_MINUS = "\u2212" # unambiguous minus sign (html => &minus;)
26
+ U_NDASH = "\u2013" # ndash (html => &ndash; ascii => --)
27
+ U_MDASH = "\u2014" # mdash (html => &mdash; ascii => ---)
28
+
29
+
30
+ def convert_unicode_dashes_to_plain_ascii( text, opts = {} )
31
+
32
+ text = text.gsub( /(#{U_HYPHEN}|#{U_NON_BREAKING_HYPHEN}|#{U_MINUS}|#{U_NDASH}|#{U_MDASH})/ ) do |_|
33
+
34
+ # puts "found U+#{'%04X' % $1.ord} (#{$1})"
35
+
36
+ msg = ''
37
+
38
+ if $1 == U_HYPHEN
39
+ msg << "found hyhpen U+2010 (#{$1})"
40
+ elsif $1 == U_NON_BREAKING_HYPHEN
41
+ msg << "found non_breaking_hyhpen U+2011 (#{$1})"
42
+ elsif $1 == U_MINUS
43
+ msg << "found minus U+2212 (#{$1})"
44
+ elsif $1 == U_NDASH
45
+ msg << "found ndash U+2013 (#{$1})"
46
+ elsif $1 == U_MDASH
47
+ msg << "found mdash U+2014 (#{$1})"
48
+ else
49
+ msg << "found unknown unicode dash U+#{'%04X' % $1.ord} (#{$1})"
50
+ end
51
+
52
+ msg << " in file >#{opts[:path]}<" if opts[:path]
53
+ msg << "; converting to plain ascii hyphen_minus (-)"
20
54
 
55
+ puts "*** warning: #{msg}"
56
+
57
+ '-'
58
+ end
59
+
60
+ text
61
+ end # method convert_unicode_dashes_to_plain_ascii
62
+
63
+
21
64
  ############
22
65
  ### fix/todo: share helper for all text readers/parsers- where to put it?
23
66
  ###
@@ -1,7 +1,6 @@
1
1
 
2
2
  module TextUtils
3
3
 
4
- VERSION = '0.5.1'
4
+ VERSION = '0.5.2'
5
5
 
6
6
  end # module TextUtils
7
-
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: textutils
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,22 +9,22 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2013-02-27 00:00:00.000000000 Z
12
+ date: 2013-03-16 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: logutils
16
- requirement: &75610580 !ruby/object:Gem::Requirement
16
+ requirement: &84567730 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
- - - ! '>='
19
+ - - ~>
20
20
  - !ruby/object:Gem::Version
21
- version: 0.5.0
21
+ version: '0.5'
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *75610580
24
+ version_requirements: *84567730
25
25
  - !ruby/object:Gem::Dependency
26
26
  name: rdoc
27
- requirement: &75610310 !ruby/object:Gem::Requirement
27
+ requirement: &84567510 !ruby/object:Gem::Requirement
28
28
  none: false
29
29
  requirements:
30
30
  - - ~>
@@ -32,10 +32,10 @@ dependencies:
32
32
  version: '3.10'
33
33
  type: :development
34
34
  prerelease: false
35
- version_requirements: *75610310
35
+ version_requirements: *84567510
36
36
  - !ruby/object:Gem::Dependency
37
37
  name: hoe
38
- requirement: &75610040 !ruby/object:Gem::Requirement
38
+ requirement: &84567260 !ruby/object:Gem::Requirement
39
39
  none: false
40
40
  requirements:
41
41
  - - ~>
@@ -43,7 +43,7 @@ dependencies:
43
43
  version: '3.3'
44
44
  type: :development
45
45
  prerelease: false
46
- version_requirements: *75610040
46
+ version_requirements: *84567260
47
47
  description: textutils - Text Filters, Helpers, Readers and More
48
48
  email: webslideshow@googlegroups.com
49
49
  executables: []