mosaheh 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Guardfile +5 -0
- data/lib/mosaheh.rb +2 -0
- data/lib/mosaheh/core_ext.rb +3 -0
- data/lib/mosaheh/core_ext/string.rb +17 -0
- data/lib/mosaheh/encoder.rb +62 -97
- data/lib/mosaheh/version.rb +1 -1
- data/mosaheh.gemspec +6 -1
- data/spec/lib/mosaheh/core_ext/string_spec.rb +15 -0
- data/spec/lib/mosaheh/encoder_spec.rb +25 -0
- data/spec/misencoded_samples/ar.txt +1 -0
- data/spec/spec_helper.rb +8 -0
- metadata +54 -5
- data/spec/mosaheh/encoder_spec.rb +0 -17
data/Guardfile
ADDED
data/lib/mosaheh.rb
CHANGED
@@ -0,0 +1,17 @@
|
|
1
|
+
# encoding: utf-8
|
2
|
+
class String
|
3
|
+
|
4
|
+
# Removes the first character from the String
|
5
|
+
# and returns it back
|
6
|
+
#
|
7
|
+
# @example
|
8
|
+
# str = 'abc'
|
9
|
+
# str.shift! # => 'a'
|
10
|
+
# p str # => 'bc'
|
11
|
+
# @return [String] The removed first character
|
12
|
+
def shift!
|
13
|
+
char = self[0]
|
14
|
+
self[0] = ''
|
15
|
+
char
|
16
|
+
end
|
17
|
+
end
|
data/lib/mosaheh/encoder.rb
CHANGED
@@ -1,118 +1,83 @@
|
|
1
1
|
# encoding: UTF-8
|
2
2
|
|
3
3
|
class Mosaheh::Encoder
|
4
|
-
|
5
|
-
# Arabic UTF-8 Block, from U+0600 to U+06FF separated using '_*_'
|
6
|
-
AR = '_*__*__*__*__*__*_؆_*_؇_*_؈_*_؉_*_؊_*_؋_*_،_*_؍_*_؎_*_؏_*_ؐ_*_ؑ_*_ؒ_*_ؓ_*_ؔ_*_ؕ_*_ؖ_*_ؗ_*_ؘ_*_ؙ_*_ؚ_*_؛_*__*_؝_*_؞_*_؟_*_ؠ_*_ء_*_آ_*_أ_*_ؤ_*_إ_*_ئ_*_ا_*_ب_*_ة_*_ت_*_ث_*_ج_*_ح_*_خ_*_د_*_ذ_*_ر_*_ز_*_س_*_ش_*_ص_*_ض_*_ط_*_ظ_*_ع_*_غ_*_ػ_*_ؼ_*_ؽ_*_ؾ_*_ؿ_*_ـ_*_ف_*_ق_*_ك_*_ل_*_م_*_ن_*_ه_*_و_*_ى_*_ي_*_ً_*_ٌ_*_ٍ_*_َ_*_ُ_*_ِ_*_ّ_*_ْ_*_ٓ_*_ٔ_*_ٕ_*_ٖ_*_ٗ_*_٘_*_ٙ_*_ٚ_*_ٛ_*_ٜ_*_ٝ_*_ٞ_*_ٟ_*_٠_*_١_*_٢_*_٣_*_٤_*_٥_*_٦_*_٧_*_٨_*_٩_*_٪_*_٫_*_٬_*_٭_*_ٮ_*_ٯ_*_ٰ_*_ٱ_*_ٲ_*_ٳ_*_ٴ_*_ٵ_*_ٶ_*_ٷ_*_ٸ_*_ٹ_*_ٺ_*_ٻ_*_ټ_*_ٽ_*_پ_*_ٿ_*_ڀ_*_ځ_*_ڂ_*_ڃ_*_ڄ_*_څ_*_چ_*_ڇ_*_ڈ_*_ډ_*_ڊ_*_ڋ_*_ڌ_*_ڍ_*_ڎ_*_ڏ_*_ڐ_*_ڑ_*_ڒ_*_ړ_*_ڔ_*_ڕ_*_ږ_*_ڗ_*_ژ_*_ڙ_*_ښ_*_ڛ_*_ڜ_*_ڝ_*_ڞ_*_ڟ_*_ڠ_*_ڡ_*_ڢ_*_ڣ_*_ڤ_*_ڥ_*_ڦ_*_ڧ_*_ڨ_*_ک_*_ڪ_*_ګ_*_ڬ_*_ڭ_*_ڮ_*_گ_*_ڰ_*_ڱ_*_ڲ_*_ڳ_*_ڴ_*_ڵ_*_ڶ_*_ڷ_*_ڸ_*_ڹ_*_ں_*_ڻ_*_ڼ_*_ڽ_*_ھ_*_ڿ_*_ۀ_*_ہ_*_ۂ_*_ۃ_*_ۄ_*_ۅ_*_ۆ_*_ۇ_*_ۈ_*_ۉ_*_ۊ_*_ۋ_*_ی_*_ۍ_*_ێ_*_ۏ_*_ې_*_ۑ_*_ے_*_ۓ_*_۔_*_ە_*_ۖ_*_ۗ_*_ۘ_*_ۙ_*_ۚ_*_ۛ_*_ۜ_*__*_۞_*_۟_*_۠_*_ۡ_*_ۢ_*_ۣ_*_ۤ_*_ۥ_*_ۦ_*_ۧ_*_ۨ_*_۩_*_۪_*_۫_*_۬_*_ۭ_*_ۮ_*_ۯ_*_۰_*_۱_*_۲_*_۳_*_۴_*_۵_*_۶_*_۷_*_۸_*_۹_*_ۺ_*_ۻ_*_ۼ_*_۽_*_۾_*_ۿ'
|
7
4
|
|
8
|
-
#
|
9
|
-
|
5
|
+
# Initialize the encoder
|
6
|
+
def initialize
|
7
|
+
# UTF-8 bytes-sequences always begin with one of (0xD8 - 0xDB) for Arabic
|
8
|
+
@utf_8_beginning_chars = [*216..219].map(&:chr).join.force_encoding('cp1252')
|
10
9
|
|
11
|
-
|
12
|
-
|
10
|
+
# Misencoded sequences can be correctly re-encoded to utf-8, EXCEPT for one
|
11
|
+
# character which gets replaced with a space (ASCII for it: 32)!
|
12
|
+
@problem_char = 32.chr.force_encoding('cp1252')
|
13
13
|
|
14
|
-
|
14
|
+
# The correct replacement for the problem character
|
15
|
+
@correct_char = 160.chr.force_encoding('cp1252')
|
15
16
|
end
|
16
|
-
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
@map.each do |broken, good|
|
24
|
-
|
25
|
-
str = "[#{broken.join(', ')}]"
|
26
|
-
|
27
|
-
justification = str.length if str.length > justification
|
28
|
-
str = str.ljust justification
|
29
|
-
|
30
|
-
str += ' => '
|
31
|
-
str += "[#{good.join(', ')}]"
|
32
|
-
str += " # #{ar[i]}\n"
|
33
|
-
|
34
|
-
mappings += str
|
35
|
-
|
36
|
-
i += 1
|
37
|
-
end
|
38
|
-
|
39
|
-
puts mappings
|
40
|
-
end
|
41
|
-
|
17
|
+
|
18
|
+
# Repairs Arabic (U+0600 - U+06FF) data
|
19
|
+
# which has been misread as cp1252 and re-encoded to UTF-8
|
20
|
+
# although the original data was UTF-8 encoded
|
21
|
+
#
|
22
|
+
# @param [String] Misencoded string
|
23
|
+
# @return [String] Correctly encoded utf-8 string
|
42
24
|
def repair(str)
|
43
|
-
|
44
|
-
|
45
|
-
|
46
|
-
|
25
|
+
|
26
|
+
# Data buffers
|
27
|
+
source = str.clone
|
28
|
+
fixed = ""
|
29
|
+
|
30
|
+
# Each string needs a new converter instance
|
31
|
+
ec = Encoding::Converter.new('utf-8', 'cp1252')
|
32
|
+
|
33
|
+
until source.empty?
|
47
34
|
|
48
|
-
#
|
49
|
-
#
|
50
|
-
|
35
|
+
# Don't process correctly UTF-8
|
36
|
+
# encoded Arabic data
|
37
|
+
if is_arabic?(source[0])
|
38
|
+
fixed += source.shift!.force_encoding('cp1252')
|
39
|
+
next
|
40
|
+
end
|
51
41
|
|
52
|
-
|
53
|
-
|
54
|
-
|
42
|
+
state = ec.primitive_convert(source, fixed, nil, nil, Encoding::Converter::AFTER_OUTPUT)
|
43
|
+
|
44
|
+
# When an undefined sequence is found, we only move the
|
45
|
+
# 2nd byte to the fixed data, as it will be valid in UTF-8.
|
46
|
+
# For example: 129 is undefined in cp1252, but it is in UTF-8.
|
47
|
+
# If we get a sequence like:
|
48
|
+
# ec.last_error.error_char.unpack('C*') # => [194, 129]
|
49
|
+
# We just ignore the 194 and add the 129 to the fixed data
|
50
|
+
if state == :undefined_conversion
|
51
|
+
c = ec.last_error.error_char.unpack('C*')[1].chr
|
52
|
+
fixed += c.force_encoding('cp1252')
|
53
|
+
end
|
55
54
|
|
56
|
-
|
55
|
+
# After each byte gets converted, check for the problem character
|
56
|
+
# and replace it if it's found
|
57
|
+
if state == :after_output && ends_with_problem?(fixed)
|
58
|
+
fixed.gsub!(/#{@problem_char}$/, @correct_char)
|
59
|
+
end
|
57
60
|
end
|
58
61
|
|
59
|
-
|
60
|
-
@repaired = []
|
61
|
-
result
|
62
|
+
fixed.force_encoding('utf-8')
|
62
63
|
end
|
63
64
|
|
64
65
|
private
|
65
66
|
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
72
|
-
end
|
73
|
-
end
|
74
|
-
|
75
|
-
def sequence_found_in_map
|
76
|
-
(1..4).each do |i|
|
77
|
-
broken_seq = @broken[0..i]
|
78
|
-
if @map.has_key?(broken_seq)
|
79
|
-
@repaired += @map[broken_seq]
|
80
|
-
@broken.slice!(0, i + 1)
|
81
|
-
return true
|
82
|
-
end
|
83
|
-
end
|
84
|
-
false
|
67
|
+
# Used to test for correctly encoded UTF-8 Arabic
|
68
|
+
#
|
69
|
+
# @param [String] Data to test
|
70
|
+
# @return [Boolean]
|
71
|
+
def is_arabic?(str)
|
72
|
+
str =~ %r{([\u0600-\u06FF])+}u
|
85
73
|
end
|
86
74
|
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
)
|
95
|
-
# Add the byte to the repaired sequence and remove it from the broken one
|
96
|
-
@repaired << @broken.shift
|
97
|
-
end
|
98
|
-
end
|
99
|
-
|
100
|
-
def handle_unknown_byte
|
101
|
-
case
|
102
|
-
# Handle the case when the last 2 bytes are the beginning of
|
103
|
-
# a broken sequence but it's not found in the mappings hash.
|
104
|
-
# The best guess is that they're 2 one-byte chars.
|
105
|
-
#
|
106
|
-
# Handles: [195, (152 | 153 | 154 | 155)]
|
107
|
-
when @broken.length == 2
|
108
|
-
@repaired << @broken.first << @broken[1]
|
109
|
-
@broken.slice!(0, 2)
|
110
|
-
|
111
|
-
# If we are here, then we have no idea what is this byte!
|
112
|
-
# We will just use the replace_char to replace the unknown byte_not_broken
|
113
|
-
# in the repaired sequence
|
114
|
-
else
|
115
|
-
@repaired += @replace_char.bytes.to_a
|
116
|
-
end
|
75
|
+
# Used to check for the problem char in the end
|
76
|
+
# of a given String
|
77
|
+
#
|
78
|
+
# @param [String] Data to test
|
79
|
+
# @return [Boolean]
|
80
|
+
def ends_with_problem?(str)
|
81
|
+
str =~ %r{[#{@utf_8_beginning_chars}]#{@problem_char}$}
|
117
82
|
end
|
118
83
|
end
|
data/lib/mosaheh/version.rb
CHANGED
data/mosaheh.gemspec
CHANGED
@@ -18,5 +18,10 @@ Gem::Specification.new do |s|
|
|
18
18
|
s.executables = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
|
19
19
|
s.require_paths = ["lib"]
|
20
20
|
|
21
|
-
s.add_development_dependency
|
21
|
+
s.add_development_dependency 'rspec'
|
22
|
+
s.add_development_dependency 'guard'
|
23
|
+
s.add_development_dependency 'guard-rspec'
|
24
|
+
# Linux notifications for guard
|
25
|
+
s.add_development_dependency 'rb-inotify'
|
26
|
+
s.add_development_dependency 'libnotify'
|
22
27
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.expand_path('../../../../spec_helper', __FILE__)
|
3
|
+
|
4
|
+
describe String, '#shift!' do
|
5
|
+
let(:string) { 'abc' }
|
6
|
+
|
7
|
+
it 'should remove the first charecter' do
|
8
|
+
string.shift!
|
9
|
+
string.should == 'bc'
|
10
|
+
end
|
11
|
+
|
12
|
+
it 'should return the removed charecter' do
|
13
|
+
string.shift!.should == 'a'
|
14
|
+
end
|
15
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.expand_path('../../../spec_helper', __FILE__)
|
3
|
+
|
4
|
+
describe Mosaheh::Encoder do
|
5
|
+
let(:encoder) { Mosaheh::Encoder.new }
|
6
|
+
|
7
|
+
describe '#repair' do
|
8
|
+
it 'should repair the whole Arabic unicode codeblock' do
|
9
|
+
# Arabic: U+0600 - U+06FF
|
10
|
+
good_ar = (0..255).map{|i| eval '"\u06' + ("%02x" % i) + '"'}.join ' '
|
11
|
+
broken_ar = load_misencoded_ar
|
12
|
+
encoder.repair(broken_ar).should == good_ar
|
13
|
+
end
|
14
|
+
|
15
|
+
it 'should not change ASCII chars' do
|
16
|
+
text = [*0..127].map(&:chr).join
|
17
|
+
encoder.repair(text).should == text
|
18
|
+
end
|
19
|
+
|
20
|
+
it 'should not change correctly encoded Arabic chars' do
|
21
|
+
text = 'إختبار'
|
22
|
+
encoder.repair(text).should == text
|
23
|
+
end
|
24
|
+
end
|
25
|
+
end
|
@@ -0,0 +1 @@
|
|
1
|
+
Ø€ Ø Ø‚ ؃ Ø„ Ø… ؆ ؇ ؈ ؉ ØŠ Ø‹ ØŒ Ø ØŽ Ø Ø Ø‘ Ø’ Ø“ Ø” Ø• Ø– Ø— ؘ Ø™ Øš Ø› Øœ Ø Øž ØŸ Ø Ø¡ Ø¢ Ø£ ؤ Ø¥ ئ ا ب Ø© ت Ø« ج Ø Ø® د ذ ر ز س Ø´ ص ض Ø· ظ ع غ Ø» ؼ ؽ ؾ Ø¿ Ù€ Ù Ù‚ Ùƒ Ù„ Ù… Ù† Ù‡ Ùˆ Ù‰ ÙŠ Ù‹ ÙŒ Ù ÙŽ Ù Ù Ù‘ Ù’ Ù“ Ù” Ù• Ù– Ù— Ù˜ Ù™ Ùš Ù› Ùœ Ù Ùž ÙŸ Ù Ù¡ Ù¢ Ù£ Ù¤ Ù¥ Ù¦ Ù§ Ù¨ Ù© Ùª Ù« Ù¬ Ù Ù® Ù¯ Ù° Ù± Ù² Ù³ Ù´ Ùµ Ù¶ Ù· Ù¸ Ù¹ Ùº Ù» Ù¼ Ù½ Ù¾ Ù¿ Ú€ Ú Ú‚ Úƒ Ú„ Ú… Ú† Ú‡ Úˆ Ú‰ ÚŠ Ú‹ ÚŒ Ú ÚŽ Ú Ú Ú‘ Ú’ Ú“ Ú” Ú• Ú– Ú— Ú˜ Ú™ Úš Ú› Úœ Ú Úž ÚŸ Ú Ú¡ Ú¢ Ú£ Ú¤ Ú¥ Ú¦ Ú§ Ú¨ Ú© Úª Ú« Ú¬ Ú Ú® Ú¯ Ú° Ú± Ú² Ú³ Ú´ Úµ Ú¶ Ú· Ú¸ Ú¹ Úº Ú» Ú¼ Ú½ Ú¾ Ú¿ Û€ Û Û‚ Ûƒ Û„ Û… Û† Û‡ Ûˆ Û‰ ÛŠ Û‹ ÛŒ Û ÛŽ Û Û Û‘ Û’ Û“ Û” Û• Û– Û— Û˜ Û™ Ûš Û› Ûœ Û Ûž ÛŸ Û Û¡ Û¢ Û£ Û¤ Û¥ Û¦ Û§ Û¨ Û© Ûª Û« Û¬ Û Û® Û¯ Û° Û± Û² Û³ Û´ Ûµ Û¶ Û· Û¸ Û¹ Ûº Û» Û¼ Û½ Û¾ Û¿
|
data/spec/spec_helper.rb
CHANGED
@@ -7,3 +7,11 @@ path = File.expand_path(File.dirname(__FILE__) + "/../lib/")
|
|
7
7
|
$LOAD_PATH.unshift(path) unless $LOAD_PATH.include?(path)
|
8
8
|
|
9
9
|
require 'mosaheh'
|
10
|
+
|
11
|
+
def load_misencoded_sample(filename)
|
12
|
+
File.read("#{File.dirname(__FILE__)}/misencoded_samples/#{filename}").chomp
|
13
|
+
end
|
14
|
+
|
15
|
+
def load_misencoded_ar
|
16
|
+
load_misencoded_sample('ar.txt')
|
17
|
+
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: mosaheh
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.
|
4
|
+
version: 0.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-
|
12
|
+
date: 2011-11-05 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rspec
|
16
|
-
requirement: &
|
16
|
+
requirement: &18315980 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,7 +21,51 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *18315980
|
25
|
+
- !ruby/object:Gem::Dependency
|
26
|
+
name: guard
|
27
|
+
requirement: &18315480 !ruby/object:Gem::Requirement
|
28
|
+
none: false
|
29
|
+
requirements:
|
30
|
+
- - ! '>='
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
type: :development
|
34
|
+
prerelease: false
|
35
|
+
version_requirements: *18315480
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: guard-rspec
|
38
|
+
requirement: &18314980 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *18314980
|
47
|
+
- !ruby/object:Gem::Dependency
|
48
|
+
name: rb-inotify
|
49
|
+
requirement: &18314480 !ruby/object:Gem::Requirement
|
50
|
+
none: false
|
51
|
+
requirements:
|
52
|
+
- - ! '>='
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '0'
|
55
|
+
type: :development
|
56
|
+
prerelease: false
|
57
|
+
version_requirements: *18314480
|
58
|
+
- !ruby/object:Gem::Dependency
|
59
|
+
name: libnotify
|
60
|
+
requirement: &18313920 !ruby/object:Gem::Requirement
|
61
|
+
none: false
|
62
|
+
requirements:
|
63
|
+
- - ! '>='
|
64
|
+
- !ruby/object:Gem::Version
|
65
|
+
version: '0'
|
66
|
+
type: :development
|
67
|
+
prerelease: false
|
68
|
+
version_requirements: *18313920
|
25
69
|
description: Mosaheh repairs UTF-8 Arabic (U+0600 - U+06FF) text which has been mistakenly
|
26
70
|
saved as single-byte latin1 encoding (cp1252). The biggest usecase for it is to
|
27
71
|
repair Arabic data stored in MySql databases with the wrong encoding.
|
@@ -33,14 +77,19 @@ extra_rdoc_files: []
|
|
33
77
|
files:
|
34
78
|
- .gitignore
|
35
79
|
- Gemfile
|
80
|
+
- Guardfile
|
36
81
|
- README.md
|
37
82
|
- Rakefile
|
38
83
|
- lib/mosaheh.rb
|
84
|
+
- lib/mosaheh/core_ext.rb
|
85
|
+
- lib/mosaheh/core_ext/string.rb
|
39
86
|
- lib/mosaheh/encoder.rb
|
40
87
|
- lib/mosaheh/version.rb
|
41
88
|
- mosaheh.gemspec
|
42
89
|
- spec/.rspec
|
43
|
-
- spec/mosaheh/
|
90
|
+
- spec/lib/mosaheh/core_ext/string_spec.rb
|
91
|
+
- spec/lib/mosaheh/encoder_spec.rb
|
92
|
+
- spec/misencoded_samples/ar.txt
|
44
93
|
- spec/spec_helper.rb
|
45
94
|
homepage: ''
|
46
95
|
licenses: []
|
@@ -1,17 +0,0 @@
|
|
1
|
-
# encoding: UTF-8
|
2
|
-
require File.expand_path('../../spec_helper', __FILE__)
|
3
|
-
|
4
|
-
describe Mosaheh::Encoder do
|
5
|
-
let(:encoder) { Mosaheh::Encoder.new }
|
6
|
-
|
7
|
-
describe '#repair' do
|
8
|
-
it 'should repair the whole Arabic unicode codeblock' do
|
9
|
-
encoder.repair(Mosaheh::Encoder::BROKEN_AR).unpack('C*').should eq Mosaheh::Encoder::AR.unpack('C*')
|
10
|
-
end
|
11
|
-
|
12
|
-
it 'should not change ASCII chars' do
|
13
|
-
text = [*0..127].map(&:chr).join
|
14
|
-
encoder.repair(text).should == text
|
15
|
-
end
|
16
|
-
end
|
17
|
-
end
|