asciidammit2 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +33 -0
- data/Rakefile +10 -0
- data/asciidammit2.gemspec +25 -0
- data/lib/asciidammit.rb +465 -0
- data/lib/asciidammit/version.rb +3 -0
- data/spec/asciidammit_spec.rb +32 -0
- metadata +113 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: f5a506cf717abed78a7676b3853183c5a8c98664
|
4
|
+
data.tar.gz: 933891e3f1c4d3ae86193684fc70b8cdd09b589b
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 295a44d49826a51249e031264b8dd25f0abcd161f21ed35abe3afb1792c37dea8235854a8a6109f477585cce44e4dc5ab38fd0eca4ea1335206ebbfb7c5db308
|
7
|
+
data.tar.gz: 608f16edac803b6b3aca83ccc34f458ba35d906e0ba3163db820bae5d534bb88627d3c80bc176e6c50dc20ea77ef4565818cd6eba80570b9465818d681c8f852
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 Forrest Chang
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,33 @@
|
|
1
|
+
# Asciidammit2
|
2
|
+
|
3
|
+
A port of asciidammit.py. Originally created to strip out characters
|
4
|
+
that were giving our email provider problems. Extracted for general
|
5
|
+
use. Another asciidammit gem exists with a different API.
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application's Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'asciidammit2'
|
13
|
+
```
|
14
|
+
|
15
|
+
And then execute:
|
16
|
+
|
17
|
+
$ bundle
|
18
|
+
|
19
|
+
Or install it yourself as:
|
20
|
+
|
21
|
+
$ gem install asciidammit2
|
22
|
+
|
23
|
+
## Usage
|
24
|
+
|
25
|
+
ascii_str = Asciidammit.demoronize(string_with_weird_chars)
|
26
|
+
|
27
|
+
## Contributing
|
28
|
+
|
29
|
+
1. Fork it ( https://github.com/fkchang/asciidammit2/fork )
|
30
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
31
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
32
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
33
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
lib = File.expand_path('../lib', __FILE__)
|
3
|
+
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
|
4
|
+
require 'asciidammit/version'
|
5
|
+
|
6
|
+
# Gem specification for asciidammit2: package metadata, the list of shipped
# files (taken from git) and the development-only dependencies.
Gem::Specification.new do |spec|
  spec.name          = 'asciidammit2'
  spec.version       = Asciidammit::VERSION
  spec.authors       = ['Forrest Chang']
  spec.email         = ['fkc_email-ruby@yahoo.com']
  # No blank at the start of these literals: whatever is written here is what
  # gets published in the gem metadata.
  spec.summary       = 'A straight port of asciidammit.py.'
  spec.description   = 'A straight port of asciidammit.py used for a work project, ' \
                       'made into a gem to be used elsewhere. There is an asciidammit ' \
                       'gem which sports a different interface.'
  spec.homepage      = 'https://github.com/fkchang/asciidammit2'
  spec.license       = 'MIT'

  # Ship exactly what git tracks; -z / "\x0" keeps unusual file names intact.
  spec.files         = `git ls-files -z`.split("\x0")
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ['lib']

  spec.add_development_dependency 'bundler', '~> 1.7'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'rspec'
  spec.add_development_dependency 'rspec-nc'
end
|
data/lib/asciidammit.rb
ADDED
@@ -0,0 +1,465 @@
|
|
1
|
+
require "asciidammit/version"
|
2
|
+
|
3
|
+
# ported from asciidammit.py
|
4
|
+
# different API and class name (Asciidammit vs AsciiDammit) than the existing asciidammit gem
|
5
|
+
module Asciidammit
|
6
|
+
|
7
|
+
# Replacement table keyed by CP1252 byte value (0x80-0xFF) plus three extra
# Unicode hyphen code points. Each value is a two-element array:
#   [0] the plain-ASCII approximation (the only element the build loop for
#       UTF8_CHARS reads),
#   [1] what appears to be the HTML entity name for the character, or a
#       placeholder where none was supplied.
# Entity names corrected here: 'lt' (was 'lt;'), 'plusmn' (was 'plusmm'),
# 'AElig' (was 'Aelig'), 'ETH' (was 'Eth'), 'THORN' (was 'Thorn').
CP1252_CHARS = {
  0x80 => ['EUR', 'euro'],
  0x81 => [' ', ' '],
  0x82 => [',', 'sbquo'],
  0x83 => ['f', 'fnof'],
  0x84 => [',,', 'bdquo'],
  0x85 => ['...', 'hellip'],
  0x86 => ['+', 'dagger'],
  0x87 => ['++', 'Dagger'],
  0x88 => ['^', 'caret'],
  0x89 => ['%', '%'],
  0x8A => ['S', 'Scaron'],
  0x8B => ['<', 'lt'],
  0x8C => ['OE', 'OElig'],
  0x8D => ['?', '?'],
  0x8E => ['Z', 'Z'],
  0x8F => ['?', '?'],
  0x90 => ['?', '?'],
  0x91 => ["'", 'lsquo'],
  0x92 => ["'", 'rsquo'],
  0x93 => ['"', 'ldquo'],
  0x94 => ['"', 'rdquo'],
  0x95 => ['*', 'bull'],
  0x96 => ['-', 'ndash'],
  0x97 => ['--', 'mdash'],
  0x98 => ['~', 'tilde'],
  0x99 => ['[TM]', 'trade'],
  0x9a => ['s', 'scaron'],
  0x9b => ['>', 'gt'],
  0x9c => ['oe', 'oelig'],
  0x9d => ['?', '?'],
  0x9e => ['z', 'z'],
  0x9f => ['Y', 'Yuml'],
  0xa0 => [' ', 'nbsp'],
  0xa1 => ['!', 'iexcl'],
  0xa2 => ['c', 'cent'],
  0xa3 => ['GBP', 'pound'],
  0xa4 => ['$', 'curren'], # This approximation is especially lame.
  0xa5 => ['YEN', 'yen'],
  0xa6 => ['|', 'brvbar'],
  0xa7 => ['S', 'sect'],
  0xa8 => ['..', 'uml'],
  0xa9 => ['(c)', 'copy'],
  0xaa => ['[th]', 'ordf'],
  0xab => ['<<', 'laquo'],
  0xac => ['!', 'not'],
  0xad => [' ', 'shy'],
  0xae => ['[R]', 'reg'],
  0xaf => ['-', 'macr'],
  0xb0 => ['o', 'deg'],
  0xb1 => ['+-', 'plusmn'],
  0xb2 => ['2', 'sup2'],
  0xb3 => ['3', 'sup3'],
  0xb4 => ["'", 'acute'],
  0xb5 => ['u', 'micro'],
  0xb6 => ['P', 'para'],
  0xb7 => ['*', 'middot'],
  0xb8 => [',', 'cedil'],
  0xb9 => ['1', 'sup1'],
  0xba => ['[th]', 'ordm'],
  0xbb => ['>>', 'raquo'],
  0xbc => ['1/4', 'frac14'],
  0xbd => ['1/2', 'frac12'],
  0xbe => ['3/4', 'frac34'],
  0xbf => ['?', 'iquest'],
  0xc0 => ['A', 'Agrave'],
  0xc1 => ['A', 'Aacute'],
  0xc2 => ['A', 'Acirc'],
  0xc3 => ['A', 'Atilde'],
  0xc4 => ['A', 'Auml'],
  0xc5 => ['A', 'Aring'],
  0xc6 => ['AE', 'AElig'],
  0xc7 => ['C', 'Ccedil'],
  0xc8 => ['E', 'Egrave'],
  0xc9 => ['E', 'Eacute'],
  0xca => ['E', 'Ecirc'],
  0xcb => ['E', 'Euml'],
  0xcc => ['I', 'Igrave'],
  0xcd => ['I', 'Iacute'],
  0xce => ['I', 'Icirc'],
  0xcf => ['I', 'Iuml'],
  0xd0 => ['D', 'ETH'],
  0xd1 => ['N', 'Ntilde'],
  0xd2 => ['O', 'Ograve'],
  0xd3 => ['O', 'Oacute'],
  0xd4 => ['O', 'Ocirc'],
  0xd5 => ['O', 'Otilde'],
  0xd6 => ['O', 'Ouml'],
  0xd7 => ['*', 'times'],
  0xd8 => ['O', 'Oslash'],
  0xd9 => ['U', 'Ugrave'],
  0xda => ['U', 'Uacute'],
  0xdb => ['U', 'Ucirc'],
  0xdc => ['U', 'Uuml'],
  0xdd => ['Y', 'Yacute'],
  0xde => ['b', 'THORN'],
  0xdf => ['B', 'szlig'],
  0xe0 => ['a', 'agrave'],
  0xe1 => ['a', 'aacute'],
  0xe2 => ['a', 'acirc'],
  0xe3 => ['a', 'atilde'],
  0xe4 => ['a', 'auml'],
  0xe5 => ['a', 'aring'],
  0xe6 => ['ae', 'aelig'],
  0xe7 => ['c', 'ccedil'],
  0xe8 => ['e', 'egrave'],
  0xe9 => ['e', 'eacute'],
  0xea => ['e', 'ecirc'],
  0xeb => ['e', 'euml'],
  0xec => ['i', 'igrave'],
  0xed => ['i', 'iacute'],
  0xee => ['i', 'icirc'],
  0xef => ['i', 'iuml'],
  0xf0 => ['o', 'eth'],
  0xf1 => ['n', 'ntilde'],
  0xf2 => ['o', 'ograve'],
  0xf3 => ['o', 'oacute'],
  0xf4 => ['o', 'ocirc'],
  0xf5 => ['o', 'otilde'],
  0xf6 => ['o', 'ouml'],
  0xf7 => ['/', 'divide'],
  0xf8 => ['o', 'oslash'],
  0xf9 => ['u', 'ugrave'],
  0xfa => ['u', 'uacute'],
  0xfb => ['u', 'ucirc'],
  0xfc => ['u', 'uuml'],
  0xfd => ['y', 'yacute'],
  0xfe => ['b', 'thorn'],
  0xff => ['y', 'yuml'],
  # Not CP1252 bytes: Unicode hyphen variants added because they were
  # crashing Vertical Response.
  0x2010 => ['-', '#x2010'],
  0x2011 => ['-', '#x2011'],
  0x2012 => ['-', '#x2012'],
}
|
140
|
+
|
141
|
+
# from http://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/WINDOWS/CP1252.TXT
|
142
|
+
# Maps each CP1252 byte value to its Unicode code point; bytes that CP1252
# leaves undefined map to nil.
#
#    Name:            cp1252 to Unicode table
#    Unicode version: 2.0
#    Table version:   2.01
#    Table format:    Format A
#    Date:            04/15/98
#
#    Contact:         Shawn.Steele@microsoft.com
#
#    General notes:   none
#
# Bytes 0x00-0x7F and 0xA0-0xFF map to the identically numbered code point,
# so those rows are generated from the range; only 0x80-0x9F differ and are
# listed explicitly. Key order is 0x00..0xFF followed by the extra hyphens,
# because merge keeps existing keys in place and appends new ones.
CP1252_MAP = Hash[(0x00..0xFF).map { |byte| [byte, byte] }].merge({
  0x80 => 0x20AC, # EURO SIGN
  0x81 => nil,    # UNDEFINED
  0x82 => 0x201A, # SINGLE LOW-9 QUOTATION MARK
  0x83 => 0x0192, # LATIN SMALL LETTER F WITH HOOK
  0x84 => 0x201E, # DOUBLE LOW-9 QUOTATION MARK
  0x85 => 0x2026, # HORIZONTAL ELLIPSIS
  0x86 => 0x2020, # DAGGER
  0x87 => 0x2021, # DOUBLE DAGGER
  0x88 => 0x02C6, # MODIFIER LETTER CIRCUMFLEX ACCENT
  0x89 => 0x2030, # PER MILLE SIGN
  0x8A => 0x0160, # LATIN CAPITAL LETTER S WITH CARON
  0x8B => 0x2039, # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
  0x8C => 0x0152, # LATIN CAPITAL LIGATURE OE
  0x8D => nil,    # UNDEFINED
  0x8E => 0x017D, # LATIN CAPITAL LETTER Z WITH CARON
  0x8F => nil,    # UNDEFINED
  0x90 => nil,    # UNDEFINED
  0x91 => 0x2018, # LEFT SINGLE QUOTATION MARK
  0x92 => 0x2019, # RIGHT SINGLE QUOTATION MARK
  0x93 => 0x201C, # LEFT DOUBLE QUOTATION MARK
  0x94 => 0x201D, # RIGHT DOUBLE QUOTATION MARK
  0x95 => 0x2022, # BULLET
  0x96 => 0x2013, # EN DASH
  0x97 => 0x2014, # EM DASH
  0x98 => 0x02DC, # SMALL TILDE
  0x99 => 0x2122, # TRADE MARK SIGN
  0x9A => 0x0161, # LATIN SMALL LETTER S WITH CARON
  0x9B => 0x203A, # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
  0x9C => 0x0153, # LATIN SMALL LIGATURE OE
  0x9D => nil,    # UNDEFINED
  0x9E => 0x017E, # LATIN SMALL LETTER Z WITH CARON
  0x9F => 0x0178, # LATIN CAPITAL LETTER Y WITH DIAERESIS
  # added by me for crashing VR -- these are not CP1252 bytes; each is keyed
  # by its own Unicode code point.
  # http://unicode-search.net/unicode-namesearch.pl?term=hyphen
  0x2010 => 0x2010, # HYPHEN
  0x2011 => 0x2011, # NON BREAKING HYPHEN
  0x2012 => 0x2012, # FIGURE DASH
})
|
425
|
+
|
426
|
+
# from http://redhanded.hobix.com/inspect/closingInOnUnicodeWithJcode.html
|
427
|
+
# Expands every "U+XXXX" token in +str+ (exactly four hex digits, either
# case) into the UTF-8 character with that code point. Text that is not part
# of such a token is copied through unchanged.
#
# @param str [String] text containing zero or more "U+XXXX" tokens
# @return [String] a new string; +str+ itself is not modified
def self.utf_encode(str)
  # pack('U*') turns the integer code point into its UTF-8 encoded character.
  str.gsub(/U\+([0-9a-fA-F]{4})/u) { [Regexp.last_match(1).hex].pack('U*') }
end
|
430
|
+
|
431
|
+
|
432
|
+
# Lookup used by demoronize: each key is a Regexp matching one UTF-8
# character, each value the plain-ASCII text that replaces it. The nil seed
# entry carries no pattern; it is kept because the constant is public.
UTF8_CHARS = { nil => "" }
# Too lazy to manually create a table:
# map the cp1252 codes to their unicode values.
CP1252_CHARS.each do |cp1252_code, (ascii_text, _entity)|
  code_point = CP1252_MAP[cp1252_code]
  # A code with no Unicode mapping (nil in CP1252_MAP) is matched by its own
  # numeric value and replaced with nothing.
  replacement = code_point ? ascii_text : ""
  character = utf_encode(format('U+%04X', code_point || cp1252_code))
  # Escape so the character is always matched literally.
  UTF8_CHARS[Regexp.new(Regexp.escape(character))] = replacement
end
|
449
|
+
|
450
|
+
|
451
|
+
# Returns a copy of +orig_str+ in which every character matched by a pattern
# in UTF8_CHARS has been replaced with that pattern's ASCII text.
#
# @param orig_str [String, nil] text to clean; it is never modified
# @return [String, nil] the cleaned copy, or nil when +orig_str+ is nil
def self.demoronize(orig_str)
  return orig_str if orig_str.nil?

  UTF8_CHARS.each_with_object(orig_str.dup) do |(pattern, ascii), text|
    # UTF8_CHARS is seeded with a nil key, which is not a usable pattern.
    next unless pattern

    # gsub! is a no-op when nothing matches, so no separate match test is needed.
    text.gsub!(pattern, ascii)
  end
end
|
465
|
+
end
|
@@ -0,0 +1,32 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
require 'asciidammit'
|
3
|
+
describe Asciidammit do
|
4
|
+
# keep this for handy string w/lots of symbols
|
5
|
+
it "should demoronize the special characters provided by word" do
|
6
|
+
Asciidammit.demoronize( "—–‑ ©®™§¶…‘’“”" ).should == "---- (c)[R][TM]SP...''\"\""
|
7
|
+
end
|
8
|
+
# the MS Word provided symbols 1 at a time
|
9
|
+
[
|
10
|
+
["—", "--" ],
|
11
|
+
["–", "-"],
|
12
|
+
["‑", "-"],
|
13
|
+
["", " "],
|
14
|
+
[" ", " "],
|
15
|
+
["©", "(c)"],
|
16
|
+
["®", "[R]"],
|
17
|
+
["™", "[TM]"],
|
18
|
+
["§", "S"],
|
19
|
+
["¶", "P"],
|
20
|
+
["…", "..."],
|
21
|
+
["‘", "'"],
|
22
|
+
["’", "'"],
|
23
|
+
["“", "\""],
|
24
|
+
["”", "\""],
|
25
|
+
|
26
|
+
].each { |ms_symbol, ascii|
|
27
|
+
it "should convert #{ms_symbol} to #{ascii}" do
|
28
|
+
expect(Asciidammit.demoronize( ms_symbol)).to eq ascii
|
29
|
+
end
|
30
|
+
}
|
31
|
+
|
32
|
+
end
|
metadata
ADDED
@@ -0,0 +1,113 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: asciidammit2
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Forrest Chang
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-09-18 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: bundler
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ~>
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '1.7'
|
20
|
+
type: :development
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ~>
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '1.7'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: rake
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - ~>
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: '10.0'
|
34
|
+
type: :development
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ~>
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '10.0'
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: rspec
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - '>='
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '0'
|
48
|
+
type: :development
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: rspec-nc
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - '>='
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '0'
|
62
|
+
type: :development
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - '>='
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '0'
|
69
|
+
description: ' A straight port of asciidammit.py used for a work project, made into
|
70
|
+
a gem to be used elsewhere. There is an asciidammit gem which sports a different
|
71
|
+
interface.'
|
72
|
+
email:
|
73
|
+
- fkc_email-ruby@yahoo.com
|
74
|
+
executables: []
|
75
|
+
extensions: []
|
76
|
+
extra_rdoc_files: []
|
77
|
+
files:
|
78
|
+
- .gitignore
|
79
|
+
- Gemfile
|
80
|
+
- LICENSE.txt
|
81
|
+
- README.md
|
82
|
+
- Rakefile
|
83
|
+
- asciidammit2.gemspec
|
84
|
+
- lib/asciidammit.rb
|
85
|
+
- lib/asciidammit/version.rb
|
86
|
+
- spec/asciidammit_spec.rb
|
87
|
+
homepage: https://github.com/fkchang/asciidammit2
|
88
|
+
licenses:
|
89
|
+
- MIT
|
90
|
+
metadata: {}
|
91
|
+
post_install_message:
|
92
|
+
rdoc_options: []
|
93
|
+
require_paths:
|
94
|
+
- lib
|
95
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
96
|
+
requirements:
|
97
|
+
- - '>='
|
98
|
+
- !ruby/object:Gem::Version
|
99
|
+
version: '0'
|
100
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
101
|
+
requirements:
|
102
|
+
- - '>='
|
103
|
+
- !ruby/object:Gem::Version
|
104
|
+
version: '0'
|
105
|
+
requirements: []
|
106
|
+
rubyforge_project:
|
107
|
+
rubygems_version: 2.0.6
|
108
|
+
signing_key:
|
109
|
+
specification_version: 4
|
110
|
+
summary: A straight port of asciidammit.py.
|
111
|
+
test_files:
|
112
|
+
- spec/asciidammit_spec.rb
|
113
|
+
has_rdoc:
|