mosaheh 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/Guardfile +5 -0
- data/lib/mosaheh.rb +2 -0
- data/lib/mosaheh/core_ext.rb +3 -0
- data/lib/mosaheh/core_ext/string.rb +17 -0
- data/lib/mosaheh/encoder.rb +62 -97
- data/lib/mosaheh/version.rb +1 -1
- data/mosaheh.gemspec +6 -1
- data/spec/lib/mosaheh/core_ext/string_spec.rb +15 -0
- data/spec/lib/mosaheh/encoder_spec.rb +25 -0
- data/spec/misencoded_samples/ar.txt +1 -0
- data/spec/spec_helper.rb +8 -0
- metadata +54 -5
- data/spec/mosaheh/encoder_spec.rb +0 -17
data/Guardfile
ADDED
data/lib/mosaheh.rb
CHANGED
@@ -0,0 +1,17 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
class String
|
3
|
+
|
4
|
+
# Removes the first character from the String
|
5
|
+
# and returns it back
|
6
|
+
#
|
7
|
+
# @example
|
8
|
+
# str = 'abc'
|
9
|
+
# str.shift! # => 'a'
|
10
|
+
# p str # => 'bc'
|
11
|
+
# @return [String] The removed first character
|
12
|
+
def shift!
|
13
|
+
char = self[0]
|
14
|
+
self[0] = ''
|
15
|
+
char
|
16
|
+
end
|
17
|
+
end
|
data/lib/mosaheh/encoder.rb
CHANGED
@@ -1,118 +1,83 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
class Mosaheh::Encoder
|
4
|
-
|
5
|
-
# Arabic UTF-8 Block, from U+0600 to U+06FF separated using '_*_'
|
6
|
-
AR = '_*__*__*__*__*__*_؆_*_؇_*_؈_*_؉_*_؊_*_؋_*_،_*_؍_*_؎_*_؏_*_ؐ_*_ؑ_*_ؒ_*_ؓ_*_ؔ_*_ؕ_*_ؖ_*_ؗ_*_ؘ_*_ؙ_*_ؚ_*_؛_*__*_؝_*_؞_*_؟_*_ؠ_*_ء_*_آ_*_أ_*_ؤ_*_إ_*_ئ_*_ا_*_ب_*_ة_*_ت_*_ث_*_ج_*_ح_*_خ_*_د_*_ذ_*_ر_*_ز_*_س_*_ش_*_ص_*_ض_*_ط_*_ظ_*_ع_*_غ_*_ػ_*_ؼ_*_ؽ_*_ؾ_*_ؿ_*_ـ_*_ف_*_ق_*_ك_*_ل_*_م_*_ن_*_ه_*_و_*_ى_*_ي_*_ً_*_ٌ_*_ٍ_*_َ_*_ُ_*_ِ_*_ّ_*_ْ_*_ٓ_*_ٔ_*_ٕ_*_ٖ_*_ٗ_*_٘_*_ٙ_*_ٚ_*_ٛ_*_ٜ_*_ٝ_*_ٞ_*_ٟ_*_٠_*_١_*_٢_*_٣_*_٤_*_٥_*_٦_*_٧_*_٨_*_٩_*_٪_*_٫_*_٬_*_٭_*_ٮ_*_ٯ_*_ٰ_*_ٱ_*_ٲ_*_ٳ_*_ٴ_*_ٵ_*_ٶ_*_ٷ_*_ٸ_*_ٹ_*_ٺ_*_ٻ_*_ټ_*_ٽ_*_پ_*_ٿ_*_ڀ_*_ځ_*_ڂ_*_ڃ_*_ڄ_*_څ_*_چ_*_ڇ_*_ڈ_*_ډ_*_ڊ_*_ڋ_*_ڌ_*_ڍ_*_ڎ_*_ڏ_*_ڐ_*_ڑ_*_ڒ_*_ړ_*_ڔ_*_ڕ_*_ږ_*_ڗ_*_ژ_*_ڙ_*_ښ_*_ڛ_*_ڜ_*_ڝ_*_ڞ_*_ڟ_*_ڠ_*_ڡ_*_ڢ_*_ڣ_*_ڤ_*_ڥ_*_ڦ_*_ڧ_*_ڨ_*_ک_*_ڪ_*_ګ_*_ڬ_*_ڭ_*_ڮ_*_گ_*_ڰ_*_ڱ_*_ڲ_*_ڳ_*_ڴ_*_ڵ_*_ڶ_*_ڷ_*_ڸ_*_ڹ_*_ں_*_ڻ_*_ڼ_*_ڽ_*_ھ_*_ڿ_*_ۀ_*_ہ_*_ۂ_*_ۃ_*_ۄ_*_ۅ_*_ۆ_*_ۇ_*_ۈ_*_ۉ_*_ۊ_*_ۋ_*_ی_*_ۍ_*_ێ_*_ۏ_*_ې_*_ۑ_*_ے_*_ۓ_*_۔_*_ە_*_ۖ_*_ۗ_*_ۘ_*_ۙ_*_ۚ_*_ۛ_*_ۜ_*__*_۞_*_۟_*_۠_*_ۡ_*_ۢ_*_ۣ_*_ۤ_*_ۥ_*_ۦ_*_ۧ_*_ۨ_*_۩_*_۪_*_۫_*_۬_*_ۭ_*_ۮ_*_ۯ_*_۰_*_۱_*_۲_*_۳_*_۴_*_۵_*_۶_*_۷_*_۸_*_۹_*_ۺ_*_ۻ_*_ۼ_*_۽_*_۾_*_ۿ'
|
7
4
|
|
8
|
-
#
|
9
|
-
|
5
|
+
# Initialize the encoder
|
6
|
+
def initialize
|
7
|
+
# UTF-8 bytes-sequences always begin with one of (0xD8 - 0xDB) for Arabic
|
8
|
+
@utf_8_beginning_chars = [*216..219].map(&:chr).join.force_encoding('cp1252')
|
10
9
|
|
11
|
-
|
12
|
-
|
10
|
+
# Misencoded sequences can be correctly re-encoded to utf-8, EXCEPT for one
|
11
|
+
# character which gets replaced with a space (ASCII for it: 32)!
|
12
|
+
@problem_char = 32.chr.force_encoding('cp1252')
|
13
13
|
|
14
|
-
|
14
|
+
# The correct replacement for the problem character
|
15
|
+
@correct_char = 160.chr.force_encoding('cp1252')
|
15
16
|
end
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
@map.each do |broken, good|
|
24
|
-
|
25
|
-
str = "[#{broken.join(', ')}]"
|
26
|
-
|
27
|
-
justification = str.length if str.length > justification
|
28
|
-
str = str.ljust justification
|
29
|
-
|
30
|
-
str += ' => '
|
31
|
-
str += "[#{good.join(', ')}]"
|
32
|
-
str += " # #{ar[i]}\n"
|
33
|
-
|
34
|
-
mappings += str
|
35
|
-
|
36
|
-
i += 1
|
37
|
-
end
|
38
|
-
|
39
|
-
puts mappings
|
40
|
-
end
|
41
|
-
|
17
|
+
|
18
|
+
# Repairs Arabic (U+0600 - U+06FF) data
|
19
|
+
# which has been misencoded from cp1252 to UTF-8
|
20
|
+
# although the original data was UTF-8 encoded
|
21
|
+
#
|
22
|
+
# @param [String] Misencoded string
|
23
|
+
# @return [String] Correctly encoded utf-8 string
|
42
24
|
def repair(str)
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
25
|
+
|
26
|
+
# Data buffers
|
27
|
+
source = str.clone
|
28
|
+
fixed = ""
|
29
|
+
|
30
|
+
# Each string needs a new converter instance
|
31
|
+
ec = Encoding::Converter.new('utf-8', 'cp1252')
|
32
|
+
|
33
|
+
until source.empty?
|
47
34
|
|
48
|
-
#
|
49
|
-
#
|
50
|
-
|
35
|
+
# Don't process correctly UTF-8
|
36
|
+
# encoded Arabic data
|
37
|
+
if is_arabic?(source[0])
|
38
|
+
fixed += source.shift!.force_encoding('cp1252')
|
39
|
+
next
|
40
|
+
end
|
51
41
|
|
52
|
-
|
53
|
-
|
54
|
-
|
42
|
+
state = ec.primitive_convert(source, fixed, nil, nil, Encoding::Converter::AFTER_OUTPUT)
|
43
|
+
|
44
|
+
# When an undefined sequence is found, we only move the
|
45
|
+
# 2nd byte to the fixed data, as it will be valid in UTF-8.
|
46
|
+
# For example: 129 is undefined in cp1252, but it is in UTF-8.
|
47
|
+
# If we get a sequence like:
|
48
|
+
# ec.last_error.error_char.unpack('C*') # => [194, 129]
|
49
|
+
# We just ignore the 194 and add the 129 to the fixed data
|
50
|
+
if state == :undefined_conversion
|
51
|
+
c = ec.last_error.error_char.unpack('C*')[1].chr
|
52
|
+
fixed += c.force_encoding('cp1252')
|
53
|
+
end
|
55
54
|
|
56
|
-
|
55
|
+
# After each byte gets converted, check for the problem character
|
56
|
+
# and replace it if it's found
|
57
|
+
if state == :after_output && ends_with_problem?(fixed)
|
58
|
+
fixed.gsub!(/#{@problem_char}$/, @correct_char)
|
59
|
+
end
|
57
60
|
end
|
58
61
|
|
59
|
-
|
60
|
-
@repaired = []
|
61
|
-
result
|
62
|
+
fixed.force_encoding('utf-8')
|
62
63
|
end
|
63
64
|
|
64
65
|
private
|
65
66
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
def sequence_found_in_map
|
76
|
-
(1..4).each do |i|
|
77
|
-
broken_seq = @broken[0..i]
|
78
|
-
if @map.has_key?(broken_seq)
|
79
|
-
@repaired += @map[broken_seq]
|
80
|
-
@broken.slice!(0, i + 1)
|
81
|
-
return true
|
82
|
-
end
|
83
|
-
end
|
84
|
-
false
|
67
|
+
# Used to test for correctly encoded UTF-8 Arabic
|
68
|
+
#
|
69
|
+
# @param [String] Data to test
|
70
|
+
# @return [Boolean]
|
71
|
+
def is_arabic?(str)
|
72
|
+
str =~ %r{([\u0600-\u06FF])+}u
|
85
73
|
end
|
86
74
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
)
|
95
|
-
# Add the byte to the repaired sequence and remove it from the broken one
|
96
|
-
@repaired << @broken.shift
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
100
|
-
def handle_unknown_byte
|
101
|
-
case
|
102
|
-
# Handle the case when the last 2 bytes are the beginning of
|
103
|
-
# a broken sequence but it's not found in the mappings hash.
|
104
|
-
# The best guess is that they're 2 one-byte chars.
|
105
|
-
#
|
106
|
-
# Handles: [195, (152 | 153 | 154 | 155)]
|
107
|
-
when @broken.length == 2
|
108
|
-
@repaired << @broken.first << @broken[1]
|
109
|
-
@broken.slice!(0, 2)
|
110
|
-
|
111
|
-
# If we are here, then we have no idea what this byte is!
|
112
|
-
# We will just use the replace_char to replace the unknown byte_not_broken
|
113
|
-
# in the repaired sequence
|
114
|
-
else
|
115
|
-
@repaired += @replace_char.bytes.to_a
|
116
|
-
end
|
75
|
+
# Used to check for the problem char at the end
|
76
|
+
# of a given String
|
77
|
+
#
|
78
|
+
# @param [String] Data to test
|
79
|
+
# @return [Boolean]
|
80
|
+
def ends_with_problem?(str)
|
81
|
+
str =~ %r{[#{@utf_8_beginning_chars}]#{@problem_char}$}
|
117
82
|
end
|
118
83
|
end
|
data/lib/mosaheh/version.rb
CHANGED
data/mosaheh.gemspec
CHANGED
@@ -18,5 +18,10 @@ Gem::Specification.new do |s|
|
|
18
18
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
19
|
s.require_paths = ["lib"]
|
20
20
|
|
21
|
-
s.add_development_dependency
|
21
|
+
s.add_development_dependency 'rspec'
|
22
|
+
s.add_development_dependency 'guard'
|
23
|
+
s.add_development_dependency 'guard-rspec'
|
24
|
+
# Linux notifications for guard
|
25
|
+
s.add_development_dependency 'rb-inotify'
|
26
|
+
s.add_development_dependency 'libnotify'
|
22
27
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.expand_path('../../../../spec_helper', __FILE__)
|
3
|
+
|
4
|
+
describe String, '#shift!' do
|
5
|
+
let(:string) { 'abc' }
|
6
|
+
|
7
|
+
it 'should remove the first charecter' do
|
8
|
+
string.shift!
|
9
|
+
string.should == 'bc'
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'should return the removed charecter' do
|
13
|
+
string.shift!.should == 'a'
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.expand_path('../../../spec_helper', __FILE__)
|
3
|
+
|
4
|
+
describe Mosaheh::Encoder do
|
5
|
+
let(:encoder) { Mosaheh::Encoder.new }
|
6
|
+
|
7
|
+
describe '#repair' do
|
8
|
+
it 'should repair the whole Arabic unicode codeblock' do
|
9
|
+
# Arabic: U+0600 - U+06FF
|
10
|
+
good_ar = (0..255).map{|i| eval '"\u06' + ("%02x" % i) + '"'}.join ' '
|
11
|
+
broken_ar = load_misencoded_ar
|
12
|
+
encoder.repair(broken_ar).should == good_ar
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should not change ASCII chars' do
|
16
|
+
text = [*0..127].map(&:chr).join
|
17
|
+
encoder.repair(text).should == text
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'should not change correctly encoded Arabic chars' do
|
21
|
+
text = 'إختبار'
|
22
|
+
encoder.repair(text).should == text
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
Ø€ Ø Ø‚ ؃ Ø„ Ø… ؆ ؇ ؈ ؉ ØŠ Ø‹ ØŒ Ø ØŽ Ø Ø Ø‘ Ø’ Ø“ Ø” Ø• Ø– Ø— ؘ Ø™ Øš Ø› Øœ Ø Øž ØŸ Ø Ø¡ Ø¢ Ø£ ؤ Ø¥ ئ ا ب Ø© ت Ø« ج Ø Ø® د Ø° ر ز س Ø´ ص ض Ø· ظ ع غ Ø» ؼ ؽ ؾ Ø¿ Ù€ Ù Ù‚ Ùƒ Ù„ Ù… Ù† Ù‡ Ùˆ Ù‰ ÙŠ Ù‹ ÙŒ Ù ÙŽ Ù Ù Ù‘ Ù’ Ù“ Ù” Ù• Ù– Ù— Ù˜ Ù™ Ùš Ù› Ùœ Ù Ùž ÙŸ Ù Ù¡ Ù¢ Ù£ Ù¤ Ù¥ Ù¦ Ù§ Ù¨ Ù© Ùª Ù« Ù¬ Ù Ù® Ù¯ Ù° Ù± Ù² Ù³ Ù´ Ùµ Ù¶ Ù· Ù¸ Ù¹ Ùº Ù» Ù¼ Ù½ Ù¾ Ù¿ Ú€ Ú Ú‚ Úƒ Ú„ Ú… Ú† Ú‡ Úˆ Ú‰ ÚŠ Ú‹ ÚŒ Ú ÚŽ Ú Ú Ú‘ Ú’ Ú“ Ú” Ú• Ú– Ú— Ú˜ Ú™ Úš Ú› Úœ Ú Úž ÚŸ Ú Ú¡ Ú¢ Ú£ Ú¤ Ú¥ Ú¦ Ú§ Ú¨ Ú© Úª Ú« Ú¬ Ú Ú® Ú¯ Ú° Ú± Ú² Ú³ Ú´ Úµ Ú¶ Ú· Ú¸ Ú¹ Úº Ú» Ú¼ Ú½ Ú¾ Ú¿ Û€ Û Û‚ Ûƒ Û„ Û… Û† Û‡ Ûˆ Û‰ ÛŠ Û‹ ÛŒ Û ÛŽ Û Û Û‘ Û’ Û“ Û” Û• Û– Û— Û˜ Û™ Ûš Û› Ûœ Û Ûž ÛŸ Û Û¡ Û¢ Û£ Û¤ Û¥ Û¦ Û§ Û¨ Û© Ûª Û« Û¬ Û Û® Û¯ Û° Û± Û² Û³ Û´ Ûµ Û¶ Û· Û¸ Û¹ Ûº Û» Û¼ Û½ Û¾ Û¿
|
data/spec/spec_helper.rb
CHANGED
@@ -7,3 +7,11 @@ path = File.expand_path(File.dirname(__FILE__) + "/../lib/")
|
|
7
7
|
$LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)
|
8
8
|
|
9
9
|
require 'mosaheh'
|
10
|
+
|
11
|
+
def load_misencoded_sample(filename)
|
12
|
+
File.read("#{File.dirname(__FILE__)}/misencoded_samples/#{filename}").chomp
|
13
|
+
end
|
14
|
+
|
15
|
+
def load_misencoded_ar
|
16
|
+
load_misencoded_sample('ar.txt')
|
17
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mosaheh
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-11-05 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &18315980 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,51 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *18315980
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: guard
|
27
|
+
requirement: &18315480 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *18315480
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: guard-rspec
|
38
|
+
requirement: &18314980 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *18314980
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rb-inotify
|
49
|
+
requirement: &18314480 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *18314480
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: libnotify
|
60
|
+
requirement: &18313920 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *18313920
|
25
69
|
description: Mosaheh repairs UTF-8 Arabic (U+0600 - U+06FF) text which has been mistakenly
|
26
70
|
saved as single-byte latin1 encoding (cp1252). The biggest usecase for it is to
|
27
71
|
repair Arabic data stored in MySql databases with the wrong encoding.
|
@@ -33,14 +77,19 @@ extra_rdoc_files: []
|
|
33
77
|
files:
|
34
78
|
- .gitignore
|
35
79
|
- Gemfile
|
80
|
+
- Guardfile
|
36
81
|
- README.md
|
37
82
|
- Rakefile
|
38
83
|
- lib/mosaheh.rb
|
84
|
+
- lib/mosaheh/core_ext.rb
|
85
|
+
- lib/mosaheh/core_ext/string.rb
|
39
86
|
- lib/mosaheh/encoder.rb
|
40
87
|
- lib/mosaheh/version.rb
|
41
88
|
- mosaheh.gemspec
|
42
89
|
- spec/.rspec
|
43
|
-
- spec/mosaheh/
|
90
|
+
- spec/lib/mosaheh/core_ext/string_spec.rb
|
91
|
+
- spec/lib/mosaheh/encoder_spec.rb
|
92
|
+
- spec/misencoded_samples/ar.txt
|
44
93
|
- spec/spec_helper.rb
|
45
94
|
homepage: ''
|
46
95
|
licenses: []
|
@@ -1,17 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
require File.expand_path('../../spec_helper', __FILE__)
|
3
|
-
|
4
|
-
describe Mosaheh::Encoder do
|
5
|
-
let(:encoder) { Mosaheh::Encoder.new }
|
6
|
-
|
7
|
-
describe '#repair' do
|
8
|
-
it 'should repair the whole Arabic unicode codeblock' do
|
9
|
-
encoder.repair(Mosaheh::Encoder::BROKEN_AR).unpack('C*').should eq Mosaheh::Encoder::AR.unpack('C*')
|
10
|
-
end
|
11
|
-
|
12
|
-
it 'should not change ASCII chars' do
|
13
|
-
text = [*0..127].map(&:chr).join
|
14
|
-
encoder.repair(text).should == text
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|