asciify 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/lib/asciify.rb +107 -0
- data/lib/mappings/default.yaml +27 -0
- metadata +41 -0
data/lib/asciify.rb
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
#!/usr/bin/ruby
|
|
2
|
+
|
|
3
|
+
require 'iconv'
|
|
4
|
+
require 'enumerator'
|
|
5
|
+
require 'yaml'
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Converts strings from a source encoding to a target encoding (UTF-8 to ASCII
# by default). Codepoints below 128 are passed through unchanged; every other
# codepoint is replaced by whatever the configured mapping returns for it.
class Asciify
  # Name of the encoding used between decoding the input and encoding the output.
  Intermediate = "UCS-4"

  # String#unpack / Array#pack format for data in the intermediate encoding:
  # 32-bit big-endian unsigned integers, one per codepoint.
  # NOTE(review): this assumes iconv emits "UCS-4" big-endian without a byte
  # order mark -- confirm for the iconv implementation in use.
  PackFormat = "N*"

  # Lookup table from a single Unicode codepoint to the array of codepoints
  # that replaces it. The table is read from a YAML file.
  class Mapping

    # Converts a UTF-8 string to an array of Unicode codepoints.
    #
    # Iconv.conv opens and closes its own conversion descriptor, so repeated
    # calls do not accumulate open descriptors.
    def from_utf8(str)
      Iconv.conv(Intermediate, "UTF-8", str).unpack(PackFormat)
    end

    # Builds the mapping from Unicode codepoints to replacement characters.
    #
    # +language+ is either a Symbol naming a builtin mapping (the file
    # <tt>mappings/<language>.yaml</tt> next to this source file) or a path to
    # a YAML file that contains the mappings as a Hash of UTF-8 strings.
    #
    # +replacement+ is the UTF-8 string used for every codepoint that has no
    # entry in the mapping file.
    #
    # Raises ArgumentError when a key of the mapping file is not exactly one
    # character long.
    def initialize(language = :default, replacement = "?")
      path =
        if Symbol === language
          File.join(File.dirname(__FILE__), "mappings", "#{language}.yaml")
        else
          language
        end

      # NOTE(review): the path may be supplied by the caller and is handed to
      # YAML.load_file as-is; only load mapping files from trusted sources.
      # An empty file yields no Hash, which is treated as "no mappings".
      table = YAML.load_file(path) || {}

      # The mappings file is UTF-8; recode every string to codepoints. The
      # block form of Iconv.open closes the descriptor when the block returns.
      Iconv.open(Intermediate, "UTF-8") do |utf8|
        decode = lambda { |text| utf8.iconv(text).unpack(PackFormat) }

        # Codepoints without an explicit entry fall back to the replacement.
        @map = Hash.new(decode.call(replacement))

        table.each do |char, substitute|
          key = decode.call(char)
          unless key.size == 1
            raise ArgumentError,
                  "mapping key #{char.inspect} must be exactly one character"
          end
          @map[key.first] = decode.call(substitute)
        end
      end
    end

    # Returns the array of replacement codepoints for +codepoint+, or the
    # default replacement when the mapping has no entry for it.
    def [](codepoint)
      @map[codepoint]
    end

  end

  # Mapping from Unicode codepoints to numeric HTML entities.
  #
  #   Asciify.new(Asciify::HTMLEntities.new).convert("\303\244") #=> "&#228;"
  #
  class HTMLEntities < Mapping

    # Returns the codepoints of the decimal entity "&#<codepoint>;".
    def [](codepoint)
      from_utf8 "&##{codepoint};"
    end
  end

  # Creates a converter.
  #
  # +replacement+ is either a String in the +source+ encoding, which then
  # replaces every codepoint >= 128, or any object whose <tt>[]</tt> method
  # takes a codepoint and returns the replacement codepoints (for example a
  # Mapping).
  #
  # +target+ and +source+ are encoding names passed to Iconv.
  def initialize(replacement = "?", target = "ASCII", source = "UTF-8")
    @from_input_enc = Iconv.new(Intermediate, source)
    @to_output_enc = Iconv.new(target, Intermediate)

    if String === replacement
      fallback = @from_input_enc.iconv(replacement).unpack(PackFormat)
      # A Hash with no entries and a default value: every lookup yields the
      # replacement codepoints.
      @mapping = Hash.new(fallback)
    else
      @mapping = replacement
    end
  end

  # Returns +str+ recoded to the target encoding, with every codepoint >= 128
  # replaced through the mapping. A mapping result may be an array of several
  # codepoints; a result of nil drops the character.
  def convert(str)
    codepoints = @from_input_enc.iconv(str).unpack(PackFormat)

    replaced = codepoints.map { |codepoint|
      codepoint < 128 ? codepoint : @mapping[codepoint]
    }

    @to_output_enc.iconv(replaced.flatten.compact.pack(PackFormat))
  end

end
|
|
88
|
+
|
|
89
|
+
class String

  # Returns this string run through an Asciify converter: characters that are
  # not part of ASCII are replaced with +replacement+.
  #
  # All arguments are forwarded unchanged to Asciify.new.
  # +replacement+ is supposed to be in the same encoding as +source+.
  #
  def asciify(*args)
    converter = Asciify.new(*args)
    converter.convert(self)
  end

  # Returns true when every byte of the string is below 128 (an empty string
  # therefore counts as ASCII), false otherwise.
  def ascii?
    each_byte { |byte| return false unless byte < 128 }
    true
  end
end
|
|
104
|
+
|
|
105
|
+
# Runs only when this file is executed directly as a script; there is
# currently nothing to do in that case.
if $0 == __FILE__
end
|
|
107
|
+
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
---
|
|
2
|
+
"“": '"'
|
|
3
|
+
"”": '"'
|
|
4
|
+
"‘": "'"
|
|
5
|
+
"’": "'"
|
|
6
|
+
"„": '"'
|
|
7
|
+
"〝": '"'
|
|
8
|
+
"〞": '"'
|
|
9
|
+
"»": ">>"
|
|
10
|
+
"«": "<<"
|
|
11
|
+
"ä": "ae"
|
|
12
|
+
"ö": "oe"
|
|
13
|
+
"ü": "ue"
|
|
14
|
+
"Ä": "Ae"
|
|
15
|
+
"Ö": "Oe"
|
|
16
|
+
"Ü": "Ue"
|
|
17
|
+
"ß": "ss"
|
|
18
|
+
"æ": "ae"
|
|
19
|
+
"Æ": "AE"
|
|
20
|
+
"œ": "oe"
|
|
21
|
+
"Œ": "OE"
|
|
22
|
+
"€": "EUR"
|
|
23
|
+
"½": "1/2"
|
|
24
|
+
"¼": "1/4"
|
|
25
|
+
"¾": "3/4"
|
|
26
|
+
"©": "(c)"
|
|
27
|
+
"®": "(r)"
|
metadata
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
|
2
|
+
rubygems_version: 0.8.11
|
|
3
|
+
specification_version: 1
|
|
4
|
+
name: asciify
|
|
5
|
+
version: !ruby/object:Gem::Version
|
|
6
|
+
version: 0.1.0
|
|
7
|
+
date: 2006-01-24 00:00:00 +01:00
|
|
8
|
+
summary: Tool to strip non-ASCII characters from a string and replace them with something else
|
|
9
|
+
require_paths:
|
|
10
|
+
- lib
|
|
11
|
+
email: levin@grundeis.net
|
|
12
|
+
homepage: http://levinalex.de/ruby/asciify
|
|
13
|
+
rubyforge_project:
|
|
14
|
+
description:
|
|
15
|
+
autorequire:
|
|
16
|
+
default_executable:
|
|
17
|
+
bindir: bin
|
|
18
|
+
has_rdoc: true
|
|
19
|
+
required_ruby_version: !ruby/object:Gem::Version::Requirement
|
|
20
|
+
requirements:
|
|
21
|
+
-
|
|
22
|
+
- ">"
|
|
23
|
+
- !ruby/object:Gem::Version
|
|
24
|
+
version: 0.0.0
|
|
25
|
+
version:
|
|
26
|
+
platform: ruby
|
|
27
|
+
signing_key:
|
|
28
|
+
cert_chain:
|
|
29
|
+
authors:
|
|
30
|
+
- Levin Alexander
|
|
31
|
+
files:
|
|
32
|
+
- lib/asciify.rb
|
|
33
|
+
- lib/mappings
|
|
34
|
+
- lib/mappings/default.yaml
|
|
35
|
+
test_files: []
|
|
36
|
+
rdoc_options: []
|
|
37
|
+
extra_rdoc_files: []
|
|
38
|
+
executables: []
|
|
39
|
+
extensions: []
|
|
40
|
+
requirements: []
|
|
41
|
+
dependencies: []
|