mosaheh 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.md +46 -0
- data/Rakefile +1 -0
- data/lib/mosaheh/encoder.rb +118 -0
- data/lib/mosaheh/version.rb +4 -0
- data/lib/mosaheh.rb +6 -0
- data/mosaheh.gemspec +22 -0
- data/spec/.rspec +1 -0
- data/spec/mosaheh/encoder_spec.rb +17 -0
- data/spec/spec_helper.rb +9 -0
- metadata +70 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,46 @@
|
|
1
|
+
Mosaheh
|
2
|
+
=======
|
3
|
+
|
4
|
+
Mosaheh, Arabic for "corrector", is a Ruby library that repairs UTF-8 Arabic (U+0600 - U+06FF) text which has been mistakenly saved using the single-byte latin1 encoding (cp1252).
|
5
|
+
|
6
|
+
Usage
|
7
|
+
-----
|
8
|
+
|
9
|
+
# encoding: UTF-8
|
10
|
+
require 'mosaheh'
|
11
|
+
|
12
|
+
encoder = Mosaheh::Encoder.new
|
13
|
+
encoder.repair 'Ø¹Ø±Ø¨ÙŠ' # => عربي
|
14
|
+
|
15
|
+
When to use
|
16
|
+
-----------
|
17
|
+
|
18
|
+
The biggest use case for it is repairing Arabic data stored in MySQL databases with the wrong encoding.
|
19
|
+
|
20
|
+
Imagine that you have a MySQL database which has the correct collation for UTF-8 data, but the MySQL driver was sending the data as latin1 (the default in PHP's PDO, for example).
|
21
|
+
In this case Mosaheh will be your savior, especially if you have a lot of invaluable data.
|
22
|
+
|
23
|
+
LICENSE
|
24
|
+
-------
|
25
|
+
|
26
|
+
(The MIT License)
|
27
|
+
|
28
|
+
> Copyright (C) 2011 by Maher Sallam
|
29
|
+
>
|
30
|
+
> Permission is hereby granted, free of charge, to any person obtaining a copy
|
31
|
+
> of this software and associated documentation files (the "Software"), to deal
|
32
|
+
> in the Software without restriction, including without limitation the rights
|
33
|
+
> to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
34
|
+
> copies of the Software, and to permit persons to whom the Software is
|
35
|
+
> furnished to do so, subject to the following conditions:
|
36
|
+
>
|
37
|
+
> The above copyright notice and this permission notice shall be included in
|
38
|
+
> all copies or substantial portions of the Software.
|
39
|
+
>
|
40
|
+
> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
41
|
+
> IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
42
|
+
> FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
43
|
+
> AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
44
|
+
> LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
45
|
+
> OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
46
|
+
> THE SOFTWARE.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,118 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
|
3
|
+
class Mosaheh::Encoder
|
4
|
+
|
5
|
+
# Arabic UTF-8 block, from U+0600 to U+06FF, separated using '_*_'
|
6
|
+
AR = '_*__*__*__*__*__*_؆_*_؇_*_؈_*_؉_*_؊_*_؋_*_،_*_؍_*_؎_*_؏_*_ؐ_*_ؑ_*_ؒ_*_ؓ_*_ؔ_*_ؕ_*_ؖ_*_ؗ_*_ؘ_*_ؙ_*_ؚ_*_؛_*__*_؝_*_؞_*_؟_*_ؠ_*_ء_*_آ_*_أ_*_ؤ_*_إ_*_ئ_*_ا_*_ب_*_ة_*_ت_*_ث_*_ج_*_ح_*_خ_*_د_*_ذ_*_ر_*_ز_*_س_*_ش_*_ص_*_ض_*_ط_*_ظ_*_ع_*_غ_*_ػ_*_ؼ_*_ؽ_*_ؾ_*_ؿ_*_ـ_*_ف_*_ق_*_ك_*_ل_*_م_*_ن_*_ه_*_و_*_ى_*_ي_*_ً_*_ٌ_*_ٍ_*_َ_*_ُ_*_ِ_*_ّ_*_ْ_*_ٓ_*_ٔ_*_ٕ_*_ٖ_*_ٗ_*_٘_*_ٙ_*_ٚ_*_ٛ_*_ٜ_*_ٝ_*_ٞ_*_ٟ_*_٠_*_١_*_٢_*_٣_*_٤_*_٥_*_٦_*_٧_*_٨_*_٩_*_٪_*_٫_*_٬_*_٭_*_ٮ_*_ٯ_*_ٰ_*_ٱ_*_ٲ_*_ٳ_*_ٴ_*_ٵ_*_ٶ_*_ٷ_*_ٸ_*_ٹ_*_ٺ_*_ٻ_*_ټ_*_ٽ_*_پ_*_ٿ_*_ڀ_*_ځ_*_ڂ_*_ڃ_*_ڄ_*_څ_*_چ_*_ڇ_*_ڈ_*_ډ_*_ڊ_*_ڋ_*_ڌ_*_ڍ_*_ڎ_*_ڏ_*_ڐ_*_ڑ_*_ڒ_*_ړ_*_ڔ_*_ڕ_*_ږ_*_ڗ_*_ژ_*_ڙ_*_ښ_*_ڛ_*_ڜ_*_ڝ_*_ڞ_*_ڟ_*_ڠ_*_ڡ_*_ڢ_*_ڣ_*_ڤ_*_ڥ_*_ڦ_*_ڧ_*_ڨ_*_ک_*_ڪ_*_ګ_*_ڬ_*_ڭ_*_ڮ_*_گ_*_ڰ_*_ڱ_*_ڲ_*_ڳ_*_ڴ_*_ڵ_*_ڶ_*_ڷ_*_ڸ_*_ڹ_*_ں_*_ڻ_*_ڼ_*_ڽ_*_ھ_*_ڿ_*_ۀ_*_ہ_*_ۂ_*_ۃ_*_ۄ_*_ۅ_*_ۆ_*_ۇ_*_ۈ_*_ۉ_*_ۊ_*_ۋ_*_ی_*_ۍ_*_ێ_*_ۏ_*_ې_*_ۑ_*_ے_*_ۓ_*_۔_*_ە_*_ۖ_*_ۗ_*_ۘ_*_ۙ_*_ۚ_*_ۛ_*_ۜ_*__*_۞_*_۟_*_۠_*_ۡ_*_ۢ_*_ۣ_*_ۤ_*_ۥ_*_ۦ_*_ۧ_*_ۨ_*_۩_*_۪_*_۫_*_۬_*_ۭ_*_ۮ_*_ۯ_*_۰_*_۱_*_۲_*_۳_*_۴_*_۵_*_۶_*_۷_*_۸_*_۹_*_ۺ_*_ۻ_*_ۼ_*_۽_*_۾_*_ۿ'
|
7
|
+
|
8
|
+
# U+0600 to U+06FF encoded using cp1252
|
9
|
+
BROKEN_AR = 'Ø€_*_Ø_*_Ø‚_*_؃_*_Ø„_*_Ø…_*_؆_*_؇_*_؈_*_؉_*_ØŠ_*_Ø‹_*_ØŒ_*_Ø_*_ØŽ_*_Ø_*_Ø_*_Ø‘_*_Ø’_*_Ø“_*_Ø”_*_Ø•_*_Ø–_*_Ø—_*_ؘ_*_Ø™_*_Øš_*_Ø›_*_Øœ_*_Ø_*_Øž_*_ØŸ_*_Ø _*_Ø¡_*_Ø¢_*_Ø£_*_ؤ_*_Ø¥_*_ئ_*_ا_*_ب_*_Ø©_*_ت_*_Ø«_*_ج_*_Ø_*_Ø®_*_د_*_Ø°_*_ر_*_ز_*_س_*_Ø´_*_ص_*_ض_*_Ø·_*_ظ_*_ع_*_غ_*_Ø»_*_ؼ_*_ؽ_*_ؾ_*_Ø¿_*_Ù€_*_Ù_*_Ù‚_*_Ùƒ_*_Ù„_*_Ù…_*_Ù†_*_Ù‡_*_Ùˆ_*_Ù‰_*_ÙŠ_*_Ù‹_*_ÙŒ_*_Ù_*_ÙŽ_*_Ù_*_Ù_*_Ù‘_*_Ù’_*_Ù“_*_Ù”_*_Ù•_*_Ù–_*_Ù—_*_Ù˜_*_Ù™_*_Ùš_*_Ù›_*_Ùœ_*_Ù_*_Ùž_*_ÙŸ_*_Ù _*_Ù¡_*_Ù¢_*_Ù£_*_Ù¤_*_Ù¥_*_Ù¦_*_Ù§_*_Ù¨_*_Ù©_*_Ùª_*_Ù«_*_Ù¬_*_Ù_*_Ù®_*_Ù¯_*_Ù°_*_Ù±_*_Ù²_*_Ù³_*_Ù´_*_Ùµ_*_Ù¶_*_Ù·_*_Ù¸_*_Ù¹_*_Ùº_*_Ù»_*_Ù¼_*_Ù½_*_Ù¾_*_Ù¿_*_Ú€_*_Ú_*_Ú‚_*_Úƒ_*_Ú„_*_Ú…_*_Ú†_*_Ú‡_*_Úˆ_*_Ú‰_*_ÚŠ_*_Ú‹_*_ÚŒ_*_Ú_*_ÚŽ_*_Ú_*_Ú_*_Ú‘_*_Ú’_*_Ú“_*_Ú”_*_Ú•_*_Ú–_*_Ú—_*_Ú˜_*_Ú™_*_Úš_*_Ú›_*_Úœ_*_Ú_*_Úž_*_ÚŸ_*_Ú _*_Ú¡_*_Ú¢_*_Ú£_*_Ú¤_*_Ú¥_*_Ú¦_*_Ú§_*_Ú¨_*_Ú©_*_Úª_*_Ú«_*_Ú¬_*_Ú_*_Ú®_*_Ú¯_*_Ú°_*_Ú±_*_Ú²_*_Ú³_*_Ú´_*_Úµ_*_Ú¶_*_Ú·_*_Ú¸_*_Ú¹_*_Úº_*_Ú»_*_Ú¼_*_Ú½_*_Ú¾_*_Ú¿_*_Û€_*_Û_*_Û‚_*_Ûƒ_*_Û„_*_Û…_*_Û†_*_Û‡_*_Ûˆ_*_Û‰_*_ÛŠ_*_Û‹_*_ÛŒ_*_Û_*_ÛŽ_*_Û_*_Û_*_Û‘_*_Û’_*_Û“_*_Û”_*_Û•_*_Û–_*_Û—_*_Û˜_*_Û™_*_Ûš_*_Û›_*_Ûœ_*_Û_*_Ûž_*_ÛŸ_*_Û _*_Û¡_*_Û¢_*_Û£_*_Û¤_*_Û¥_*_Û¦_*_Û§_*_Û¨_*_Û©_*_Ûª_*_Û«_*_Û¬_*_Û_*_Û®_*_Û¯_*_Û°_*_Û±_*_Û²_*_Û³_*_Û´_*_Ûµ_*_Û¶_*_Û·_*_Û¸_*_Û¹_*_Ûº_*_Û»_*_Û¼_*_Û½_*_Û¾_*_Û¿'
|
10
|
+
|
11
|
+
# Creates an encoder and builds its broken-bytes => repaired-bytes table.
#
# options - Hash of settings (default: {}):
#           :replace_char - String whose bytes are emitted for input that
#                           cannot be repaired (default: '?').
def initialize(options = {})
  custom_char = options[:replace_char]
  @replace_char = custom_char ? custom_char : '?'

  generate_mappings_hash
end
|
16
|
+
|
17
|
+
# Prints the mappings hash to stdout, one entry per line, in the form
#
#   [broken bytes] => [repaired bytes] # character
#
# The "=>" column is aligned on the widest broken-bytes label. The character
# shown in the trailing comment is the AR entry at the same position as the
# hash entry, so this relies on @map preserving insertion order.
#
# Returns whatever puts returns (nil).
def show_mappings_hash
  ar = AR.split('_*_')

  labels = @map.keys.map { |broken| "[#{broken.join(', ')}]" }
  # Width is computed over ALL labels up front; a running maximum would leave
  # the rows printed before the longest label under-padded.
  width = labels.map(&:length).max || 0

  lines = @map.values.each_with_index.map do |good, i|
    "#{labels[i].ljust(width)} => [#{good.join(', ')}] # #{ar[i]}\n"
  end

  puts lines.join
end
|
41
|
+
|
42
|
+
# Repairs Arabic text whose UTF-8 bytes were mis-decoded as cp1252.
#
# str - String containing the broken text; it is processed byte by byte.
#
# Working state is kept in @broken (bytes still to process) and @repaired
# (output bytes so far) because the private helpers operate on those
# instance variables.
#
# Returns a new String built from the repaired bytes and tagged as UTF-8.
def repair(str)
  @broken = str.unpack('C*')
  @repaired = []

  until @broken.empty?
    # Try to use the mappings hash first and jump to the
    # next sequence if we succeed
    next if sequence_found_in_map

    # Try to handle ASCII chars if they are not a part
    # of a broken sequence
    next if byte_not_broken

    handle_unknown_byte
  end

  result = @repaired.pack('C*')
  @repaired = []

  # pack('C*') yields a binary (ASCII-8BIT) string, which never compares equal
  # to a UTF-8 string holding the same Arabic bytes. The bytes are meant to be
  # UTF-8, so tag them as such. The guard keeps the method usable on Rubies
  # whose String has no force_encoding.
  result.force_encoding('UTF-8') if result.respond_to?(:force_encoding)
  result
end
|
63
|
+
|
64
|
+
private
|
65
|
+
|
66
|
+
# Builds @map: a Hash whose keys are the byte arrays of each BROKEN_AR entry
# and whose values are the byte arrays of the AR entry at the same position.
# Both constants are split on the '_*_' separator.
def generate_mappings_hash
  good_chars   = AR.split('_*_')
  broken_chars = BROKEN_AR.split('_*_')

  @map = {}
  broken_chars.each_index do |pos|
    broken_bytes = broken_chars[pos].unpack('C*')
    @map[broken_bytes] = good_chars[pos].unpack('C*')
  end
end
|
74
|
+
|
75
|
+
# Looks for a known broken sequence at the front of @broken, trying prefixes
# of 2, 3, 4 and 5 bytes in that order. On the first hit, the mapped bytes
# are appended to @repaired and the matched prefix is removed from @broken.
#
# Returns true if a prefix was found in @map, false otherwise.
def sequence_found_in_map
  last_index = (1..4).find { |idx| @map.has_key?(@broken[0..idx]) }
  return false if last_index.nil?

  matched = @broken[0..last_index]
  @repaired += @map[matched]
  @broken.slice!(0, last_index + 1)
  true
end
|
86
|
+
|
87
|
+
# Passes the first byte of @broken through unchanged when it does not start
# a broken sequence. A broken sequence starts with byte 195 followed by a
# byte in 152..155.
#
# Returns @repaired (truthy) when a byte was moved from @broken to @repaired,
# nil when the first two bytes look like the start of a broken sequence.
def byte_not_broken
  lead = @broken.first

  standalone =
    if lead == 195
      # The byte is the beginning of a sequence, but only counts as one when
      # the next byte continues it.
      !(152..155).include?(@broken[1])
    else
      lead < 256 # One byte char
    end

  # Add the byte to the repaired sequence and remove it from the broken one
  @repaired << @broken.shift if standalone
end
|
99
|
+
|
100
|
+
# Deals with input at the front of @broken that was neither found in the
# mappings hash nor passed through by byte_not_broken. Every call removes at
# least one byte from @broken (when it is non-empty), so a loop that runs
# until @broken is empty always terminates.
#
# Reads @replace_char; mutates @broken and @repaired.
def handle_unknown_byte
  if @broken.length == 2
    # Handle the case when the last 2 bytes are the beginning of
    # a broken sequence but it's not found in the mappings hash.
    # The best guess is that they're 2 one-byte chars.
    #
    # Handles: [195, (152 | 153 | 154 | 155)]
    @repaired << @broken.first << @broken[1]
    @broken.slice!(0, 2)
  else
    # If we are here, then we have no idea what this input is!
    # We will just use the replace_char in the repaired sequence.
    @repaired += @replace_char.bytes.to_a

    # The unknown input must also be dropped; previously nothing was removed
    # here, so the same bytes were seen again forever. When the front is the
    # two-byte sequence start [195, 152..155], drop both bytes so no stray
    # second byte leaks into the output; otherwise drop a single byte.
    orphaned_lead = @broken.first == 195 && (152..155).include?(@broken[1])
    @broken.shift(orphaned_lead ? 2 : 1)
  end
end
|
118
|
+
end
|
data/lib/mosaheh.rb
ADDED
data/mosaheh.gemspec
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
# Gem specification for mosaheh. The file lists are taken from git, so the
# gem must be built from inside a git checkout.
$:.push File.expand_path("../lib", __FILE__)
require "mosaheh/version"

Gem::Specification.new do |s|
  s.name        = "mosaheh"
  s.version     = Mosaheh::VERSION
  s.authors     = ["Maher Sallam"]
  s.email       = ["maher@sallam.me"]
  s.homepage    = ""
  # The README ships the MIT license text; declare it so it shows up in the
  # gem metadata instead of an empty license list.
  s.license     = "MIT"
  s.summary     = %q{An Arabic text re-encoder from latin1 (cp1252) to Arabic (UTF-8).}
  s.description = %q{Mosaheh repairs UTF-8 Arabic (U+0600 - U+06FF) text which has been mistakenly saved as single-byte latin1 encoding (cp1252). The biggest usecase for it is to repair Arabic data stored in MySql databases with the wrong encoding.}

  s.rubyforge_project = "mosaheh"

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_development_dependency "rspec"
end
|
data/spec/.rspec
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
--color
|
@@ -0,0 +1,17 @@
|
|
1
|
+
# encoding: UTF-8
|
2
|
+
require File.expand_path('../../spec_helper', __FILE__)
|
3
|
+
|
4
|
+
# Specs for Mosaheh::Encoder#repair: the full Arabic block must round-trip
# from its broken form, and plain ASCII must pass through untouched.
describe Mosaheh::Encoder do
  let(:encoder) { Mosaheh::Encoder.new }

  describe '#repair' do
    it 'should repair the whole Arabic unicode codeblock' do
      repaired_bytes = encoder.repair(Mosaheh::Encoder::BROKEN_AR).unpack('C*')
      expected_bytes = Mosaheh::Encoder::AR.unpack('C*')

      repaired_bytes.should eq expected_bytes
    end

    it 'should not change ASCII chars' do
      text = (0..127).map(&:chr).join

      encoder.repair(text).should == text
    end
  end
end
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,70 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: mosaheh
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
prerelease:
|
6
|
+
platform: ruby
|
7
|
+
authors:
|
8
|
+
- Maher Sallam
|
9
|
+
autorequire:
|
10
|
+
bindir: bin
|
11
|
+
cert_chain: []
|
12
|
+
date: 2011-10-31 00:00:00.000000000Z
|
13
|
+
dependencies:
|
14
|
+
- !ruby/object:Gem::Dependency
|
15
|
+
name: rspec
|
16
|
+
requirement: &21735460 !ruby/object:Gem::Requirement
|
17
|
+
none: false
|
18
|
+
requirements:
|
19
|
+
- - ! '>='
|
20
|
+
- !ruby/object:Gem::Version
|
21
|
+
version: '0'
|
22
|
+
type: :development
|
23
|
+
prerelease: false
|
24
|
+
version_requirements: *21735460
|
25
|
+
description: Mosaheh repairs UTF-8 Arabic (U+0600 - U+06FF) text which has been mistakenly
|
26
|
+
saved as single-byte latin1 encoding (cp1252). The biggest usecase for it is to
|
27
|
+
repair Arabic data stored in MySql databases with the wrong encoding.
|
28
|
+
email:
|
29
|
+
- maher@sallam.me
|
30
|
+
executables: []
|
31
|
+
extensions: []
|
32
|
+
extra_rdoc_files: []
|
33
|
+
files:
|
34
|
+
- .gitignore
|
35
|
+
- Gemfile
|
36
|
+
- README.md
|
37
|
+
- Rakefile
|
38
|
+
- lib/mosaheh.rb
|
39
|
+
- lib/mosaheh/encoder.rb
|
40
|
+
- lib/mosaheh/version.rb
|
41
|
+
- mosaheh.gemspec
|
42
|
+
- spec/.rspec
|
43
|
+
- spec/mosaheh/encoder_spec.rb
|
44
|
+
- spec/spec_helper.rb
|
45
|
+
homepage: ''
|
46
|
+
licenses: []
|
47
|
+
post_install_message:
|
48
|
+
rdoc_options: []
|
49
|
+
require_paths:
|
50
|
+
- lib
|
51
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
52
|
+
none: false
|
53
|
+
requirements:
|
54
|
+
- - ! '>='
|
55
|
+
- !ruby/object:Gem::Version
|
56
|
+
version: '0'
|
57
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
58
|
+
none: false
|
59
|
+
requirements:
|
60
|
+
- - ! '>='
|
61
|
+
- !ruby/object:Gem::Version
|
62
|
+
version: '0'
|
63
|
+
requirements: []
|
64
|
+
rubyforge_project: mosaheh
|
65
|
+
rubygems_version: 1.8.10
|
66
|
+
signing_key:
|
67
|
+
specification_version: 3
|
68
|
+
summary: An Arabic text re-encoder from latin1 (cp1252) to Arabic (UTF-8).
|
69
|
+
test_files: []
|
70
|
+
has_rdoc:
|