mosaheh 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so that downloads cannot be tampered with in transit;
# the plain-HTTP endpoint offers no transport security.
source "https://rubygems.org"

# Specify your gem's dependencies in mosaheh.gemspec
gemspec
data/README.md ADDED
@@ -0,0 +1,46 @@
1
+ Mosaheh
2
+ =======
3
+
4
+ Mosaheh, Arabic for "corrector", is a Ruby library that repairs UTF-8 Arabic (U+0600 - U+06FF) text which has been mistakenly saved using the single-byte latin1 encoding (cp1252).
5
+
6
+ Usage
7
+ -----
8
+
9
+ # encoding: UTF-8
10
+ require 'mosaheh'
11
+
12
+ encoder = Mosaheh::Encoder.new
13
+ encoder.repair 'Ø¹Ø±Ø¨ÙŠ' # => عربي
14
+
15
+ When to use
16
+ -----------
17
+
18
+ The biggest use case for it is repairing Arabic data stored in MySQL databases with the wrong encoding.
19
+
20
+ Imagine that you have a MySQL database which has the correct collation for UTF-8 data, but the MySQL driver was sending the data as latin1 (the default in PHP's PDO, for example).
21
+ In this case Mosaheh will be your savior, especially if you have a lot of invaluable data.
22
+
23
+ LICENSE
24
+ -------
25
+
26
+ (The MIT License)
27
+
28
+ > Copyright (C) 2011 by Maher Sallam
29
+ >
30
+ > Permission is hereby granted, free of charge, to any person obtaining a copy
31
+ > of this software and associated documentation files (the "Software"), to deal
32
+ > in the Software without restriction, including without limitation the rights
33
+ > to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
34
+ > copies of the Software, and to permit persons to whom the Software is
35
+ > furnished to do so, subject to the following conditions:
36
+ >
37
+ > The above copyright notice and this permission notice shall be included in
38
+ > all copies or substantial portions of the Software.
39
+ >
40
+ > THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
41
+ > IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
42
+ > FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
43
+ > AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
44
+ > LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
45
+ > OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
46
+ > THE SOFTWARE.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,118 @@
1
+ # encoding: UTF-8
2
+
3
+ class Mosaheh::Encoder
4
+
5
+ # Arabic UTF-8 block, from U+0600 to U+06FF, separated using '_*_'
6
+ AR = '؀_*_؁_*_؂_*_؃_*_؄_*_؅_*_؆_*_؇_*_؈_*_؉_*_؊_*_؋_*_،_*_؍_*_؎_*_؏_*_ؐ_*_ؑ_*_ؒ_*_ؓ_*_ؔ_*_ؕ_*_ؖ_*_ؗ_*_ؘ_*_ؙ_*_ؚ_*_؛_*_؜_*_؝_*_؞_*_؟_*_ؠ_*_ء_*_آ_*_أ_*_ؤ_*_إ_*_ئ_*_ا_*_ب_*_ة_*_ت_*_ث_*_ج_*_ح_*_خ_*_د_*_ذ_*_ر_*_ز_*_س_*_ش_*_ص_*_ض_*_ط_*_ظ_*_ع_*_غ_*_ػ_*_ؼ_*_ؽ_*_ؾ_*_ؿ_*_ـ_*_ف_*_ق_*_ك_*_ل_*_م_*_ن_*_ه_*_و_*_ى_*_ي_*_ً_*_ٌ_*_ٍ_*_َ_*_ُ_*_ِ_*_ّ_*_ْ_*_ٓ_*_ٔ_*_ٕ_*_ٖ_*_ٗ_*_٘_*_ٙ_*_ٚ_*_ٛ_*_ٜ_*_ٝ_*_ٞ_*_ٟ_*_٠_*_١_*_٢_*_٣_*_٤_*_٥_*_٦_*_٧_*_٨_*_٩_*_٪_*_٫_*_٬_*_٭_*_ٮ_*_ٯ_*_ٰ_*_ٱ_*_ٲ_*_ٳ_*_ٴ_*_ٵ_*_ٶ_*_ٷ_*_ٸ_*_ٹ_*_ٺ_*_ٻ_*_ټ_*_ٽ_*_پ_*_ٿ_*_ڀ_*_ځ_*_ڂ_*_ڃ_*_ڄ_*_څ_*_چ_*_ڇ_*_ڈ_*_ډ_*_ڊ_*_ڋ_*_ڌ_*_ڍ_*_ڎ_*_ڏ_*_ڐ_*_ڑ_*_ڒ_*_ړ_*_ڔ_*_ڕ_*_ږ_*_ڗ_*_ژ_*_ڙ_*_ښ_*_ڛ_*_ڜ_*_ڝ_*_ڞ_*_ڟ_*_ڠ_*_ڡ_*_ڢ_*_ڣ_*_ڤ_*_ڥ_*_ڦ_*_ڧ_*_ڨ_*_ک_*_ڪ_*_ګ_*_ڬ_*_ڭ_*_ڮ_*_گ_*_ڰ_*_ڱ_*_ڲ_*_ڳ_*_ڴ_*_ڵ_*_ڶ_*_ڷ_*_ڸ_*_ڹ_*_ں_*_ڻ_*_ڼ_*_ڽ_*_ھ_*_ڿ_*_ۀ_*_ہ_*_ۂ_*_ۃ_*_ۄ_*_ۅ_*_ۆ_*_ۇ_*_ۈ_*_ۉ_*_ۊ_*_ۋ_*_ی_*_ۍ_*_ێ_*_ۏ_*_ې_*_ۑ_*_ے_*_ۓ_*_۔_*_ە_*_ۖ_*_ۗ_*_ۘ_*_ۙ_*_ۚ_*_ۛ_*_ۜ_*_۝_*_۞_*_۟_*_۠_*_ۡ_*_ۢ_*_ۣ_*_ۤ_*_ۥ_*_ۦ_*_ۧ_*_ۨ_*_۩_*_۪_*_۫_*_۬_*_ۭ_*_ۮ_*_ۯ_*_۰_*_۱_*_۲_*_۳_*_۴_*_۵_*_۶_*_۷_*_۸_*_۹_*_ۺ_*_ۻ_*_ۼ_*_۽_*_۾_*_ۿ'
7
+
8
+ # U+0600 to U+06FF encoded using cp1252
9
+ BROKEN_AR = 'Ø€_*_؁_*_Ø‚_*_؃_*_Ø„_*_Ø…_*_؆_*_؇_*_؈_*_؉_*_ØŠ_*_Ø‹_*_ØŒ_*_؍_*_ØŽ_*_؏_*_ؐ_*_Ø‘_*_Ø’_*_Ø“_*_Ø”_*_Ø•_*_Ø–_*_Ø—_*_ؘ_*_Ø™_*_Øš_*_Ø›_*_Øœ_*_؝_*_Øž_*_ØŸ_*_Ø _*_Ø¡_*_Ø¢_*_Ø£_*_ؤ_*_Ø¥_*_ئ_*_ا_*_ب_*_Ø©_*_ت_*_Ø«_*_ج_*_Ø­_*_Ø®_*_د_*_ذ_*_ر_*_ز_*_س_*_Ø´_*_ص_*_ض_*_Ø·_*_ظ_*_ع_*_غ_*_Ø»_*_ؼ_*_ؽ_*_ؾ_*_Ø¿_*_Ù€_*_ف_*_Ù‚_*_Ùƒ_*_Ù„_*_Ù…_*_Ù†_*_Ù‡_*_Ùˆ_*_Ù‰_*_ÙŠ_*_Ù‹_*_ÙŒ_*_ٍ_*_ÙŽ_*_ُ_*_ِ_*_Ù‘_*_Ù’_*_Ù“_*_Ù”_*_Ù•_*_Ù–_*_Ù—_*_Ù˜_*_Ù™_*_Ùš_*_Ù›_*_Ùœ_*_ٝ_*_Ùž_*_ÙŸ_*_Ù _*_Ù¡_*_Ù¢_*_Ù£_*_Ù¤_*_Ù¥_*_Ù¦_*_Ù§_*_Ù¨_*_Ù©_*_Ùª_*_Ù«_*_Ù¬_*_Ù­_*_Ù®_*_Ù¯_*_Ù°_*_Ù±_*_Ù²_*_Ù³_*_Ù´_*_Ùµ_*_Ù¶_*_Ù·_*_Ù¸_*_Ù¹_*_Ùº_*_Ù»_*_Ù¼_*_Ù½_*_Ù¾_*_Ù¿_*_Ú€_*_ځ_*_Ú‚_*_Úƒ_*_Ú„_*_Ú…_*_Ú†_*_Ú‡_*_Úˆ_*_Ú‰_*_ÚŠ_*_Ú‹_*_ÚŒ_*_ڍ_*_ÚŽ_*_ڏ_*_ڐ_*_Ú‘_*_Ú’_*_Ú“_*_Ú”_*_Ú•_*_Ú–_*_Ú—_*_Ú˜_*_Ú™_*_Úš_*_Ú›_*_Úœ_*_ڝ_*_Úž_*_ÚŸ_*_Ú _*_Ú¡_*_Ú¢_*_Ú£_*_Ú¤_*_Ú¥_*_Ú¦_*_Ú§_*_Ú¨_*_Ú©_*_Úª_*_Ú«_*_Ú¬_*_Ú­_*_Ú®_*_Ú¯_*_Ú°_*_Ú±_*_Ú²_*_Ú³_*_Ú´_*_Úµ_*_Ú¶_*_Ú·_*_Ú¸_*_Ú¹_*_Úº_*_Ú»_*_Ú¼_*_Ú½_*_Ú¾_*_Ú¿_*_Û€_*_ہ_*_Û‚_*_Ûƒ_*_Û„_*_Û…_*_Û†_*_Û‡_*_Ûˆ_*_Û‰_*_ÛŠ_*_Û‹_*_ÛŒ_*_ۍ_*_ÛŽ_*_ۏ_*_ې_*_Û‘_*_Û’_*_Û“_*_Û”_*_Û•_*_Û–_*_Û—_*_Û˜_*_Û™_*_Ûš_*_Û›_*_Ûœ_*_۝_*_Ûž_*_ÛŸ_*_Û _*_Û¡_*_Û¢_*_Û£_*_Û¤_*_Û¥_*_Û¦_*_Û§_*_Û¨_*_Û©_*_Ûª_*_Û«_*_Û¬_*_Û­_*_Û®_*_Û¯_*_Û°_*_Û±_*_Û²_*_Û³_*_Û´_*_Ûµ_*_Û¶_*_Û·_*_Û¸_*_Û¹_*_Ûº_*_Û»_*_Û¼_*_Û½_*_Û¾_*_Û¿'
10
+
11
# Creates an encoder and precomputes its byte-sequence lookup table.
#
# options - Hash of settings (default: {}):
#           :replace_char - String emitted in place of bytes that cannot be
#                           repaired (default: '?').
def initialize(options = {})
  custom_char = options[:replace_char]
  @replace_char = custom_char ? custom_char : '?'

  # Build @map once, up front, so that #repair only performs hash lookups.
  generate_mappings_hash
end
16
+
17
# Prints the lookup table to standard output, one mapping per line, in the
# form "[broken bytes] => [repaired bytes] # arabic character".
#
# The broken-bytes column is padded to the widest entry seen *so far*, so the
# padding can only grow while walking down the table.
#
# Returns nil (the result of puts).
def show_mappings_hash
  arabic_chars = AR.split('_*_')
  column_width = 0

  rows = @map.each_with_index.map do |(broken, good), index|
    broken_label = "[#{broken.join(', ')}]"
    column_width = broken_label.length if broken_label.length > column_width

    "#{broken_label.ljust(column_width)} => [#{good.join(', ')}] # #{arabic_chars[index]}\n"
  end

  puts rows.join
end
41
+
42
# Repairs text whose UTF-8 Arabic bytes were mis-decoded as cp1252 and then
# stored again as UTF-8.
#
# The input is consumed byte by byte from the front. At every position the
# first strategy that succeeds wins:
#   1. a known broken sequence is swapped for its correct bytes,
#   2. a byte that does not start a broken sequence is copied unchanged,
#   3. anything else is passed to the unknown-byte handler.
#
# str - String containing the (possibly) broken text.
#
# Returns a new String, tagged as UTF-8, holding the repaired bytes.
def repair(str)
  @broken = str.unpack('C*')
  @repaired = []

  until @broken.empty?
    # Try to use the mappings hash first and jump to the next sequence if we
    # succeed.
    next if sequence_found_in_map

    # Copy bytes that are not part of a broken sequence (e.g. ASCII).
    next if byte_not_broken

    handle_unknown_byte
  end

  # Array#pack('C*') always produces an ASCII-8BIT (binary) string. The bytes
  # are UTF-8 text, so tag them as such; otherwise the result never compares
  # equal to the same Arabic text in a UTF-8 string and is not shown as text.
  result = @repaired.pack('C*').force_encoding(Encoding::UTF_8)
  @repaired = []
  result
end
63
+
64
+ private
65
+
66
# Builds @map, the lookup table used while repairing.
#
# BROKEN_AR and AR are split on '_*_' and paired up by position; every key is
# the byte array of a broken entry and its value is the byte array of the
# entry at the same position in AR.
def generate_mappings_hash
  good_chars = AR.split('_*_')
  broken_chars = BROKEN_AR.split('_*_')

  @map = {}
  broken_chars.each_index do |position|
    @map[broken_chars[position].unpack('C*')] = good_chars[position].unpack('C*')
  end
end
74
+
75
# Tries to match the front of @broken against the lookup table, testing
# prefixes of 2, 3, 4 and 5 bytes in that order (shortest match wins).
#
# On a match the corrected bytes are appended to @repaired and the matched
# prefix is removed from @broken.
#
# Returns true when a sequence was replaced, false otherwise.
def sequence_found_in_map
  matched_length = (2..5).find { |length| @map.has_key?(@broken.first(length)) }
  return false unless matched_length

  @repaired += @map[@broken.first(matched_length)]
  @broken.slice!(0, matched_length)
  true
end
86
+
87
# Copies the first byte of @broken to @repaired when it cannot be the start
# of a broken sequence.
#
# A broken sequence starts with byte 195 followed by a byte in 152..155; a
# leading 195 followed by anything else is treated as an ordinary byte.
#
# Returns a truthy value (@repaired) when the byte was moved, nil when the
# front of @broken looks like the start of a broken sequence.
def byte_not_broken
  lead = @broken.first

  if lead == 195
    # The byte could open a sequence; it only does if the next byte agrees.
    return if (152..155).include?(@broken[1])
  else
    # Only single-byte values are copied through.
    return unless lead < 256
  end

  # Add the byte to the repaired sequence and remove it from the broken one.
  @repaired << @broken.shift
end
99
+
100
# Last-resort handler for a prefix that looks like the start of a broken
# sequence (195 followed by 152..155) but is not in the lookup table.
#
# Every call removes bytes from @broken. Previously the replacement branch
# left @broken untouched, so the caller's loop never terminated on input such
# as "Øabc".
#
# Returns @repaired.
def handle_unknown_byte
  if @broken.length == 2
    # The last 2 bytes are the beginning of a broken sequence that is not in
    # the mappings hash. The best guess is that they are genuine text, so
    # copy them through unchanged.
    #
    # Handles: [195, (152 | 153 | 154 | 155)]
    @repaired.concat(@broken.shift(2))
  else
    # We have no idea what this is: emit the replace_char instead. During
    # #repair this branch is only reached with a [195, 152..155] prefix, i.e.
    # one two-byte character, so both bytes are dropped together to avoid
    # leaving an orphaned continuation byte in the output.
    @repaired.concat(@replace_char.bytes.to_a)
    @broken.shift(2)
  end

  @repaired
end
118
+ end
@@ -0,0 +1,4 @@
1
+ # encoding: UTF-8
2
# Namespace of the Mosaheh gem.
module Mosaheh
  # Release number of the gem; mosaheh.gemspec reads it as the gem version.
  VERSION = '0.0.1'
end
data/lib/mosaheh.rb ADDED
@@ -0,0 +1,6 @@
1
+ # encoding: UTF-8
2
+ require "mosaheh/version"
3
+
4
# Entry point of the gem: registers the library's classes for lazy loading.
module Mosaheh
  # 'mosaheh/encoder' is only required the first time Mosaheh::Encoder is
  # referenced.
  autoload(:Encoder, 'mosaheh/encoder')
end
data/mosaheh.gemspec ADDED
@@ -0,0 +1,22 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "mosaheh/version"
4
+
5
# Gem specification for mosaheh. The file lists are taken from git, so only
# tracked files end up in the packaged gem.
Gem::Specification.new do |spec|
  spec.name        = "mosaheh"
  spec.version     = Mosaheh::VERSION
  spec.authors     = ["Maher Sallam"]
  spec.email       = ["maher@sallam.me"]
  spec.homepage    = ""
  spec.summary     = %q{An Arabic text re-encoder from latin1 (cp1252) to Arabic (UTF-8).}
  spec.description = %q{Mosaheh repairs UTF-8 Arabic (U+0600 - U+06FF) text which has been mistakenly saved as single-byte latin1 encoding (cp1252). The biggest usecase for it is to repair Arabic data stored in MySql databases with the wrong encoding.}

  spec.rubyforge_project = "mosaheh"

  tracked_files = %x(git ls-files).split("\n")
  tracked_tests = %x(git ls-files -- {test,spec,features}/*).split("\n")
  tracked_bins  = %x(git ls-files -- bin/*).split("\n")

  spec.files         = tracked_files
  spec.test_files    = tracked_tests
  # Executables are listed by file name only, without the bin/ prefix.
  spec.executables   = tracked_bins.map { |path| File.basename(path) }
  spec.require_paths = ["lib"]

  spec.add_development_dependency "rspec"
end
data/spec/.rspec ADDED
@@ -0,0 +1 @@
1
+ --color
@@ -0,0 +1,17 @@
1
+ # encoding: UTF-8
2
+ require File.expand_path('../../spec_helper', __FILE__)
3
+
4
# Behaviour checks for Mosaheh::Encoder#repair.
describe Mosaheh::Encoder do
  let(:encoder) { Mosaheh::Encoder.new }

  describe '#repair' do
    # Compared byte-wise so the check does not depend on the string encodings.
    it 'should repair the whole Arabic unicode codeblock' do
      expected_bytes = Mosaheh::Encoder::AR.unpack('C*')
      repaired_bytes = encoder.repair(Mosaheh::Encoder::BROKEN_AR).unpack('C*')

      repaired_bytes.should eq expected_bytes
    end

    # Every 7-bit character (0..127) must pass through untouched.
    it 'should not change ASCII chars' do
      ascii = (0..127).map(&:chr).join

      encoder.repair(ascii).should == ascii
    end
  end
end
@@ -0,0 +1,9 @@
1
+ # encoding: UTF-8
2
+
3
+ require 'rubygems'
4
+ require 'rspec'
5
+
6
# Put the gem's lib directory at the front of the load path (once) so the
# specs run against the working copy.
lib_dir = File.expand_path('../lib', File.dirname(__FILE__))
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
8
+
9
+ require 'mosaheh'
metadata ADDED
@@ -0,0 +1,70 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: mosaheh
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Maher Sallam
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-10-31 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: rspec
16
+ requirement: &21735460 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '0'
22
+ type: :development
23
+ prerelease: false
24
+ version_requirements: *21735460
25
+ description: Mosaheh repairs UTF-8 Arabic (U+0600 - U+06FF) text which has been mistakenly
26
+ saved as single-byte latin1 encoding (cp1252). The biggest usecase for it is to
27
+ repair Arabic data stored in MySql databases with the wrong encoding.
28
+ email:
29
+ - maher@sallam.me
30
+ executables: []
31
+ extensions: []
32
+ extra_rdoc_files: []
33
+ files:
34
+ - .gitignore
35
+ - Gemfile
36
+ - README.md
37
+ - Rakefile
38
+ - lib/mosaheh.rb
39
+ - lib/mosaheh/encoder.rb
40
+ - lib/mosaheh/version.rb
41
+ - mosaheh.gemspec
42
+ - spec/.rspec
43
+ - spec/mosaheh/encoder_spec.rb
44
+ - spec/spec_helper.rb
45
+ homepage: ''
46
+ licenses: []
47
+ post_install_message:
48
+ rdoc_options: []
49
+ require_paths:
50
+ - lib
51
+ required_ruby_version: !ruby/object:Gem::Requirement
52
+ none: false
53
+ requirements:
54
+ - - ! '>='
55
+ - !ruby/object:Gem::Version
56
+ version: '0'
57
+ required_rubygems_version: !ruby/object:Gem::Requirement
58
+ none: false
59
+ requirements:
60
+ - - ! '>='
61
+ - !ruby/object:Gem::Version
62
+ version: '0'
63
+ requirements: []
64
+ rubyforge_project: mosaheh
65
+ rubygems_version: 1.8.10
66
+ signing_key:
67
+ specification_version: 3
68
+ summary: An Arabic text re-encoder from latin1 (cp1252) to Arabic (UTF-8).
69
+ test_files: []
70
+ has_rdoc: