mosaheh 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in mosaheh.gemspec
gemspec
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ Mosaheh
2
+ =======
3
+
4
+ Mosaheh, Arabic for "corrector", is a Ruby library that repairs UTF-8 Arabic text (U+0600 - U+06FF) which has been mistakenly saved as if it were single-byte latin1 (cp1252) text.
5
+
6
+ Usage
7
+ -----
8
+
9
+ # encoding: UTF-8
10
+ require 'mosaheh'
11
+
12
+ encoder = Mosaheh::Encoder.new
13
+ encoder.repair 'Ø¹Ø±Ø¨ÙŠ' # => عربي
14
+
15
+ When to use
16
+ -----------
17
+
18
+ The biggest use case for it is repairing Arabic data that was stored in MySQL databases with the wrong encoding.
19
+
20
+ Imagine that you have a MySQL database which has the correct collation for UTF-8 data, but the MySQL driver was sending the data as latin1 (the default in PHP's PDO, for example).
21
+ In this case Mosaheh will be your savior, especially if you have a lot of invaluable data.
22
+
23
+ LICENSE
24
+ -------
25
+
26
+ (The MIT License)
27
+
28
+ > Copyright (C) 2011 by Maher Sallam
29
+ >
30
+ > Permission is hereby granted, free of charge, to any person obtaining a copy
31
+ > of this software and associated documentation files (the "Software"), to deal
32
+ > in the Software without restriction, including without limitation the rights
33
+ > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34
+ > copies of the Software, and to permit persons to whom the Software is
35
+ > furnished to do so, subject to the following conditions:
36
+ >
37
+ > The above copyright notice and this permission notice shall be included in
38
+ > all copies or substantial portions of the Software.
39
+ >
40
+ > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41
+ > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42
+ > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43
+ > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44
+ > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45
+ > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
46
+ > THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,118 @@
1
+ # encoding: UTF-8
2
+
3
+ class Mosaheh::Encoder
4
+
5
+ # Arabic UTF-8 block, from U+0600 to U+06FF, separated using '_*_'
6
+ AR = '؀_*_؁_*_؂_*_؃_*_؄_*_؅_*_؆_*_؇_*_؈_*_؉_*_؊_*_؋_*_،_*_؍_*_؎_*_؏_*_ؐ_*_ؑ_*_ؒ_*_ؓ_*_ؔ_*_ؕ_*_ؖ_*_ؗ_*_ؘ_*_ؙ_*_ؚ_*_؛_*_؜_*_؝_*_؞_*_؟_*_ؠ_*_ء_*_آ_*_أ_*_ؤ_*_إ_*_ئ_*_ا_*_ب_*_ة_*_ت_*_ث_*_ج_*_ح_*_خ_*_د_*_ذ_*_ر_*_ز_*_س_*_ش_*_ص_*_ض_*_ط_*_ظ_*_ع_*_غ_*_ػ_*_ؼ_*_ؽ_*_ؾ_*_ؿ_*_ـ_*_ف_*_ق_*_ك_*_ل_*_م_*_ن_*_ه_*_و_*_ى_*_ي_*_ً_*_ٌ_*_ٍ_*_َ_*_ُ_*_ِ_*_ّ_*_ْ_*_ٓ_*_ٔ_*_ٕ_*_ٖ_*_ٗ_*_٘_*_ٙ_*_ٚ_*_ٛ_*_ٜ_*_ٝ_*_ٞ_*_ٟ_*_٠_*_١_*_٢_*_٣_*_٤_*_٥_*_٦_*_٧_*_٨_*_٩_*_٪_*_٫_*_٬_*_٭_*_ٮ_*_ٯ_*_ٰ_*_ٱ_*_ٲ_*_ٳ_*_ٴ_*_ٵ_*_ٶ_*_ٷ_*_ٸ_*_ٹ_*_ٺ_*_ٻ_*_ټ_*_ٽ_*_پ_*_ٿ_*_ڀ_*_ځ_*_ڂ_*_ڃ_*_ڄ_*_څ_*_چ_*_ڇ_*_ڈ_*_ډ_*_ڊ_*_ڋ_*_ڌ_*_ڍ_*_ڎ_*_ڏ_*_ڐ_*_ڑ_*_ڒ_*_ړ_*_ڔ_*_ڕ_*_ږ_*_ڗ_*_ژ_*_ڙ_*_ښ_*_ڛ_*_ڜ_*_ڝ_*_ڞ_*_ڟ_*_ڠ_*_ڡ_*_ڢ_*_ڣ_*_ڤ_*_ڥ_*_ڦ_*_ڧ_*_ڨ_*_ک_*_ڪ_*_ګ_*_ڬ_*_ڭ_*_ڮ_*_گ_*_ڰ_*_ڱ_*_ڲ_*_ڳ_*_ڴ_*_ڵ_*_ڶ_*_ڷ_*_ڸ_*_ڹ_*_ں_*_ڻ_*_ڼ_*_ڽ_*_ھ_*_ڿ_*_ۀ_*_ہ_*_ۂ_*_ۃ_*_ۄ_*_ۅ_*_ۆ_*_ۇ_*_ۈ_*_ۉ_*_ۊ_*_ۋ_*_ی_*_ۍ_*_ێ_*_ۏ_*_ې_*_ۑ_*_ے_*_ۓ_*_۔_*_ە_*_ۖ_*_ۗ_*_ۘ_*_ۙ_*_ۚ_*_ۛ_*_ۜ_*_۝_*_۞_*_۟_*_۠_*_ۡ_*_ۢ_*_ۣ_*_ۤ_*_ۥ_*_ۦ_*_ۧ_*_ۨ_*_۩_*_۪_*_۫_*_۬_*_ۭ_*_ۮ_*_ۯ_*_۰_*_۱_*_۲_*_۳_*_۴_*_۵_*_۶_*_۷_*_۸_*_۹_*_ۺ_*_ۻ_*_ۼ_*_۽_*_۾_*_ۿ'
7
+
8
+ # U+0600 to U+06FF encoded using cp1252
9
+ BROKEN_AR = 'Ø€_*_؁_*_Ø‚_*_؃_*_Ø„_*_Ø…_*_؆_*_؇_*_؈_*_؉_*_ØŠ_*_Ø‹_*_ØŒ_*_؍_*_ØŽ_*_؏_*_ؐ_*_Ø‘_*_Ø’_*_Ø“_*_Ø”_*_Ø•_*_Ø–_*_Ø—_*_ؘ_*_Ø™_*_Øš_*_Ø›_*_Øœ_*_؝_*_Øž_*_ØŸ_*_Ø _*_Ø¡_*_Ø¢_*_Ø£_*_ؤ_*_Ø¥_*_ئ_*_ا_*_ب_*_Ø©_*_ت_*_Ø«_*_ج_*_Ø­_*_Ø®_*_د_*_Ø°_*_ر_*_ز_*_س_*_Ø´_*_ص_*_ض_*_Ø·_*_ظ_*_ع_*_غ_*_Ø»_*_ؼ_*_ؽ_*_ؾ_*_Ø¿_*_Ù€_*_ف_*_Ù‚_*_Ùƒ_*_Ù„_*_Ù…_*_Ù†_*_Ù‡_*_Ùˆ_*_Ù‰_*_ÙŠ_*_Ù‹_*_ÙŒ_*_ٍ_*_ÙŽ_*_ُ_*_ِ_*_Ù‘_*_Ù’_*_Ù“_*_Ù”_*_Ù•_*_Ù–_*_Ù—_*_Ù˜_*_Ù™_*_Ùš_*_Ù›_*_Ùœ_*_ٝ_*_Ùž_*_ÙŸ_*_Ù _*_Ù¡_*_Ù¢_*_Ù£_*_Ù¤_*_Ù¥_*_Ù¦_*_Ù§_*_Ù¨_*_Ù©_*_Ùª_*_Ù«_*_Ù¬_*_Ù­_*_Ù®_*_Ù¯_*_Ù°_*_Ù±_*_Ù²_*_Ù³_*_Ù´_*_Ùµ_*_Ù¶_*_Ù·_*_Ù¸_*_Ù¹_*_Ùº_*_Ù»_*_Ù¼_*_Ù½_*_Ù¾_*_Ù¿_*_Ú€_*_ځ_*_Ú‚_*_Úƒ_*_Ú„_*_Ú…_*_Ú†_*_Ú‡_*_Úˆ_*_Ú‰_*_ÚŠ_*_Ú‹_*_ÚŒ_*_ڍ_*_ÚŽ_*_ڏ_*_ڐ_*_Ú‘_*_Ú’_*_Ú“_*_Ú”_*_Ú•_*_Ú–_*_Ú—_*_Ú˜_*_Ú™_*_Úš_*_Ú›_*_Úœ_*_ڝ_*_Úž_*_ÚŸ_*_Ú _*_Ú¡_*_Ú¢_*_Ú£_*_Ú¤_*_Ú¥_*_Ú¦_*_Ú§_*_Ú¨_*_Ú©_*_Úª_*_Ú«_*_Ú¬_*_Ú­_*_Ú®_*_Ú¯_*_Ú°_*_Ú±_*_Ú²_*_Ú³_*_Ú´_*_Úµ_*_Ú¶_*_Ú·_*_Ú¸_*_Ú¹_*_Úº_*_Ú»_*_Ú¼_*_Ú½_*_Ú¾_*_Ú¿_*_Û€_*_ہ_*_Û‚_*_Ûƒ_*_Û„_*_Û…_*_Û†_*_Û‡_*_Ûˆ_*_Û‰_*_ÛŠ_*_Û‹_*_ÛŒ_*_ۍ_*_ÛŽ_*_ۏ_*_ې_*_Û‘_*_Û’_*_Û“_*_Û”_*_Û•_*_Û–_*_Û—_*_Û˜_*_Û™_*_Ûš_*_Û›_*_Ûœ_*_۝_*_Ûž_*_ÛŸ_*_Û _*_Û¡_*_Û¢_*_Û£_*_Û¤_*_Û¥_*_Û¦_*_Û§_*_Û¨_*_Û©_*_Ûª_*_Û«_*_Û¬_*_Û­_*_Û®_*_Û¯_*_Û°_*_Û±_*_Û²_*_Û³_*_Û´_*_Ûµ_*_Û¶_*_Û·_*_Û¸_*_Û¹_*_Ûº_*_Û»_*_Û¼_*_Û½_*_Û¾_*_Û¿'
10
+
11
# Creates an encoder and builds its lookup table of broken sequences.
#
# options - Hash of settings. The value under :replace_char is kept in
#           @replace_char; when it is missing (or falsy) '?' is used.
def initialize(options = {})
  replacement = options[:replace_char]
  @replace_char = replacement || '?'

  generate_mappings_hash
end
16
+
17
# Prints every entry of @map to standard output, one line per entry:
#
#   [broken bytes] => [repaired bytes] # character
#
# The character label is read from AR at the entry's position in @map.
# The left column is padded to the widest entry seen so far, so earlier
# lines can be narrower than later ones.
def show_mappings_hash
  labels = AR.split('_*_')
  widest = 0

  lines = @map.each_with_index.map do |(broken_bytes, good_bytes), index|
    left = "[#{broken_bytes.join(', ')}]"
    widest = left.length if left.length > widest

    "#{left.ljust(widest)} => [#{good_bytes.join(', ')}] # #{labels[index]}\n"
  end

  puts lines.join
end
41
+
42
# Repairs +str+ and returns the repaired text.
#
# The input is unpacked into bytes (@broken) and consumed from the front
# until nothing is left. On every pass the helpers are tried in this order:
#
# 1. sequence_found_in_map - use the mappings hash and jump to the next
#    sequence on success.
# 2. byte_not_broken       - pass through bytes that are not part of a
#    broken sequence.
# 3. handle_unknown_byte   - fallback for everything else.
#
# str - the String to repair.
#
# Returns a new String built from the repaired bytes, tagged as UTF-8.
def repair(str)
  @broken = str.unpack('C*')
  @repaired = []

  until @broken.empty?
    next if sequence_found_in_map
    next if byte_not_broken

    handle_unknown_byte
  end

  # Array#pack('C*') produces an ASCII-8BIT string. Tag it as UTF-8 so the
  # result compares equal to, and can be concatenated with, UTF-8 text.
  result = @repaired.pack('C*').force_encoding('UTF-8')
  @repaired = []
  result
end
63
+
64
+ private
65
+
66
# Fills @map with one entry per character listed in BROKEN_AR: the key is
# the byte sequence of the broken form, the value is the byte sequence of
# the entry at the same position in AR.
#
# Returns the Array of broken character strings that was iterated.
def generate_mappings_hash
  @map = {}
  good_chars = AR.split('_*_')
  broken_chars = BROKEN_AR.split('_*_')

  broken_chars.zip(good_chars) do |broken, good|
    @map[broken.unpack('C*')] = good.unpack('C*')
  end

  broken_chars
end
74
+
75
# Looks for a known broken sequence at the front of @broken, trying
# prefixes of 2, 3, 4 and then 5 bytes. On the first prefix that is a key
# of @map, its replacement bytes are appended to @repaired and the prefix
# is removed from @broken.
#
# Returns true when a prefix was replaced, false otherwise.
def sequence_found_in_map
  2.upto(5) do |length|
    candidate = @broken.first(length)
    next unless @map.key?(candidate)

    @repaired = @repaired + @map[candidate]
    @broken.slice!(0, length)
    return true
  end

  false
end
86
+
87
# Passes the first byte of @broken through unchanged when it does not start
# a broken sequence.
#
# A broken sequence starts with byte 195 followed by a byte in 152..155.
# Any other first byte (including a 195 that is not followed by such a
# byte) is moved from @broken to @repaired.
#
# Returns true when a byte was moved, false when the buffer is empty or its
# front looks like the beginning of a broken sequence.
def byte_not_broken
  return false if @broken.empty?

  starts_broken_sequence = @broken.first == 195 && (152..155).include?(@broken[1])
  return false if starts_broken_sequence

  # Add the byte to the repaired sequence and remove it from the broken one
  @repaired << @broken.shift
  true
end
99
+
100
# Fallback for a front of @broken that neither matched the mappings hash
# nor was passed through as an ordinary byte.
#
# * Exactly two bytes left: they look like the beginning of a broken
#   sequence that is not in the mappings hash. The best guess is that they
#   are two one-byte chars, so both are copied to @repaired as they are.
# * Otherwise: the leading byte is unknown. The bytes of @replace_char are
#   appended to @repaired and the unknown byte is dropped from @broken.
#
# Every call consumes at least one byte of a non-empty @broken; the caller's
# loop only ends once @broken is empty, so not consuming here would make it
# spin forever.
def handle_unknown_byte
  if @broken.length == 2
    # Handles: [195, (152 | 153 | 154 | 155)]
    @repaired.concat(@broken.shift(2))
  else
    @repaired.concat(@replace_char.bytes.to_a)
    @broken.shift
  end
end
118
+ end
@@ -0,0 +1,4 @@
1
+ # encoding: UTF-8
2
# Namespace of the gem; this file only carries the version number.
module Mosaheh
  # Version of the gem. Frozen so the constant cannot be mutated in place.
  VERSION = "0.0.1".freeze
end
data/lib/mosaheh.rb ADDED
@@ -0,0 +1,6 @@
1
+ # encoding: UTF-8
2
+ require "mosaheh/version"
3
+
4
# Top-level namespace of the gem.
module Mosaheh
  # Registers 'mosaheh/encoder' to be required the first time
  # Mosaheh::Encoder is referenced.
  autoload(:Encoder, 'mosaheh/encoder')
end
data/mosaheh.gemspec ADDED
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "mosaheh/version"
4
+
5
# Gem specification for mosaheh. The file lists are taken from git, so the
# gem has to be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "mosaheh"
  s.version     = Mosaheh::VERSION
  s.authors     = ["Maher Sallam"]
  s.email       = ["maher@sallam.me"]
  s.homepage    = ""
  # The README ships the MIT license text; declare it so the packaged
  # metadata does not end up with an empty license list.
  s.licenses    = ["MIT"]
  s.summary     = %q{An Arabic text re-encoder from latin1 (cp1252) to Arabic (UTF-8).}
  s.description = %q{Mosaheh repairs UTF-8 Arabic (U+0600 - U+06FF) text which has been mistakenly saved as single-byte latin1 encoding (cp1252). The biggest usecase for it is to repair Arabic data stored in MySql databases with the wrong encoding.}

  # rubyforge_project is deprecated in newer RubyGems; only set it where the
  # setter exists so the gemspec keeps loading if it is removed.
  s.rubyforge_project = "mosaheh" if s.respond_to?(:rubyforge_project=)

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  s.add_development_dependency "rspec"
end
data/spec/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,17 @@
1
+ # encoding: UTF-8
2
+ require File.expand_path('../../spec_helper', __FILE__)
3
+
4
# Specs for Mosaheh::Encoder#repair: the whole broken Arabic block must map
# back to the real one, and plain ASCII must come out untouched.
describe Mosaheh::Encoder do
  let(:encoder) { described_class.new }

  describe '#repair' do
    it 'should repair the whole Arabic unicode codeblock' do
      repaired_bytes = encoder.repair(described_class::BROKEN_AR).unpack('C*')
      expected_bytes = described_class::AR.unpack('C*')

      repaired_bytes.should eq expected_bytes
    end

    it 'should not change ASCII chars' do
      text = (0..127).map(&:chr).join

      encoder.repair(text).should == text
    end
  end
end
@@ -0,0 +1,9 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'rubygems'
4
+ require 'rspec'
5
+
6
# Put the gem's lib directory (one level above spec/) at the front of the
# load path, unless it is already listed.
lib_dir = File.expand_path('../../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
8
+
9
+ require 'mosaheh'
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mosaheh
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Maher Sallam
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-10-31 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &21735460 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *21735460
25
+ description: Mosaheh repairs UTF-8 Arabic (U+0600 - U+06FF) text which has been mistakenly
26
+ saved as single-byte latin1 encoding (cp1252). The biggest usecase for it is to
27
+ repair Arabic data stored in MySql databases with the wrong encoding.
28
+ email:
29
+ - maher@sallam.me
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - .gitignore
35
+ - Gemfile
36
+ - README.md
37
+ - Rakefile
38
+ - lib/mosaheh.rb
39
+ - lib/mosaheh/encoder.rb
40
+ - lib/mosaheh/version.rb
41
+ - mosaheh.gemspec
42
+ - spec/.rspec
43
+ - spec/mosaheh/encoder_spec.rb
44
+ - spec/spec_helper.rb
45
+ homepage: ''
46
+ licenses: []
47
+ post_install_message:
48
+ rdoc_options: []
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ! '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ requirements: []
64
+ rubyforge_project: mosaheh
65
+ rubygems_version: 1.8.10
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: An Arabic text re-encoder from latin1 (cp1252) to Arabic (UTF-8).
69
+ test_files: []
70
+ has_rdoc: