persian 0.0.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.editorconfig +9 -0
- data/.gitignore +51 -0
- data/.rspec +3 -0
- data/.rubocop.yml +32 -0
- data/.travis.yml +8 -0
- data/Gemfile +10 -0
- data/Rakefile +36 -0
- data/lib/persian/counter.rb +61 -0
- data/lib/persian/date.rb +150 -0
- data/lib/persian/dynamic.rb +38 -0
- data/lib/persian/list/alphabet.rb +107 -0
- data/lib/persian/list/character.rb +193 -0
- data/lib/persian/list/homonyms.rb +59 -0
- data/lib/persian/list/number.rb +168 -0
- data/lib/persian/num_text.rb +53 -0
- data/lib/persian/number.rb +81 -0
- data/lib/persian/text/keyboard.rb +22 -0
- data/lib/persian/text/text.rb +214 -0
- data/lib/persian/tokenizer.rb +56 -0
- data/lib/persian/unicode.rb +42 -0
- data/lib/persian/url.rb +25 -0
- data/lib/persian/version.rb +2 -1
- data/lib/persian.rb +16 -39
- data/persian.gemspec +26 -0
- data/readme.md +48 -0
- data/spec/counter_spec.rb +83 -0
- data/spec/dynamic_spec.rb +6 -0
- data/spec/num_text_spec.rb +17 -0
- data/spec/number_spec.rb +129 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/text_spec.rb +258 -0
- data/spec/tokenizer_spec.rb +31 -0
- data/spec/unicode_spec.rb +25 -0
- data/spec/url_spec.rb +11 -0
- metadata +42 -12
@@ -0,0 +1,193 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian module
|
4
|
+
module Persian
|
5
|
+
# Class text
|
6
|
+
class Text
|
7
|
+
include Alphabet
|
8
|
+
|
9
|
+
AR_FA_CHAR = {
|
10
|
+
KAF_ARABIC => KAF,
|
11
|
+
'دِ' => 'د',
|
12
|
+
'بِ' => 'ب',
|
13
|
+
'زِ' => 'ز',
|
14
|
+
'ذِ' => 'ذ',
|
15
|
+
'شِ' => 'ش',
|
16
|
+
'سِ' => 'س',
|
17
|
+
'ى' => 'ی',
|
18
|
+
YE_ARABIC => YE,
|
19
|
+
'ة' => 'ه',
|
20
|
+
'هٔ' => 'ه'
|
21
|
+
}.freeze
|
22
|
+
|
23
|
+
HAREKATS = [
|
24
|
+
AA, # Ae
|
25
|
+
EE, # E
|
26
|
+
OO, # O
|
27
|
+
AN, # An
|
28
|
+
EN, # En
|
29
|
+
ON, # On
|
30
|
+
SAKEN, # Saken
|
31
|
+
TASHDID # Tashdid
|
32
|
+
].freeze
|
33
|
+
|
34
|
+
BRACKETS = [
|
35
|
+
'[',
|
36
|
+
']',
|
37
|
+
'{',
|
38
|
+
'}',
|
39
|
+
'<',
|
40
|
+
'>',
|
41
|
+
'«',
|
42
|
+
'»'
|
43
|
+
].freeze
|
44
|
+
|
45
|
+
SIGNS = [
|
46
|
+
'!',
|
47
|
+
'@',
|
48
|
+
'#',
|
49
|
+
'$',
|
50
|
+
'%',
|
51
|
+
'&',
|
52
|
+
'*',
|
53
|
+
'~',
|
54
|
+
'`',
|
55
|
+
'\'',
|
56
|
+
'"',
|
57
|
+
':',
|
58
|
+
';',
|
59
|
+
'.',
|
60
|
+
'?',
|
61
|
+
'<',
|
62
|
+
'>',
|
63
|
+
'/',
|
64
|
+
'-',
|
65
|
+
'+',
|
66
|
+
'-',
|
67
|
+
'_',
|
68
|
+
'^',
|
69
|
+
MAD,
|
70
|
+
NOGHTE,
|
71
|
+
VIRGOOL,
|
72
|
+
NOGHTEVIRGOOL,
|
73
|
+
DONOGHTE,
|
74
|
+
TAAJOB,
|
75
|
+
SOAL,
|
76
|
+
BEALAVE,
|
77
|
+
DARSAD,
|
78
|
+
MENHA,
|
79
|
+
MOSAVI,
|
80
|
+
TAGHSIM,
|
81
|
+
ZARBDAR,
|
82
|
+
KESH
|
83
|
+
].freeze
|
84
|
+
|
85
|
+
END_VOWEL = [
|
86
|
+
HE_DOCHESHM,
|
87
|
+
ALEF,
|
88
|
+
VAV
|
89
|
+
].freeze
|
90
|
+
|
91
|
+
# Exchange Standard QWERTY Keyboard layout
|
92
|
+
EN_FA_KEYBOARD_CHAR = {
|
93
|
+
# Lowercase Letters
|
94
|
+
'q' => ZAD,
|
95
|
+
'w' => SAD,
|
96
|
+
'e' => THE,
|
97
|
+
'r' => QAF,
|
98
|
+
't' => FE,
|
99
|
+
'y' => GHEIN,
|
100
|
+
'u' => EIN,
|
101
|
+
'i' => HE_DOCHESHM,
|
102
|
+
'o' => KHE,
|
103
|
+
'p' => HE_JIMI,
|
104
|
+
'[' => JIM,
|
105
|
+
']' => CHE,
|
106
|
+
'\\' => '\\',
|
107
|
+
'a' => SHIN,
|
108
|
+
's' => SIN,
|
109
|
+
'd' => YE,
|
110
|
+
'f' => BE,
|
111
|
+
'g' => LAM,
|
112
|
+
'h' => ALEF,
|
113
|
+
'j' => TE,
|
114
|
+
'k' => NOON,
|
115
|
+
'l' => MIM,
|
116
|
+
';' => KAF,
|
117
|
+
'\'' => GAF,
|
118
|
+
'z' => ZA,
|
119
|
+
'x' => TA,
|
120
|
+
'c' => ZE,
|
121
|
+
'v' => RE,
|
122
|
+
'b' => ZAL,
|
123
|
+
'n' => DAL,
|
124
|
+
'm' => PE,
|
125
|
+
',' => VAV,
|
126
|
+
'.' => '.',
|
127
|
+
'/' => '/',
|
128
|
+
# Uppercase Letters
|
129
|
+
'Q' => 'ْ',
|
130
|
+
'W' => 'ٌ',
|
131
|
+
'E' => 'ٍ',
|
132
|
+
'R' => 'ً',
|
133
|
+
'T' => 'ُ',
|
134
|
+
'Y' => 'ِ',
|
135
|
+
'U' => 'َ',
|
136
|
+
'I' => 'ّ',
|
137
|
+
'O' => ']',
|
138
|
+
'P' => '[',
|
139
|
+
'{' => '}',
|
140
|
+
'}' => '{',
|
141
|
+
'|' => '|',
|
142
|
+
'A' => 'ؤ',
|
143
|
+
'S' => 'ئ',
|
144
|
+
'D' => 'ي',
|
145
|
+
'F' => 'إ',
|
146
|
+
'G' => 'أ',
|
147
|
+
'H' => 'آ',
|
148
|
+
'J' => 'ة',
|
149
|
+
'K' => '»',
|
150
|
+
'L' => '«',
|
151
|
+
':' => ':',
|
152
|
+
'"' => '؛',
|
153
|
+
'Z' => 'ك',
|
154
|
+
'X' => 'ٓ',
|
155
|
+
'C' => 'ژ',
|
156
|
+
'V' => 'ٰ',
|
157
|
+
'B' => '',
|
158
|
+
'N' => 'ٔ',
|
159
|
+
'M' => 'ء',
|
160
|
+
'<' => '>',
|
161
|
+
'>' => '<',
|
162
|
+
'?' => '؟',
|
163
|
+
# Numbers without shift key
|
164
|
+
'`' => '',
|
165
|
+
'1' => YEK,
|
166
|
+
'2' => DOW,
|
167
|
+
'3' => SE,
|
168
|
+
'4' => CHAHAR,
|
169
|
+
'5' => PANJ,
|
170
|
+
'6' => SHESH,
|
171
|
+
'7' => HAFT,
|
172
|
+
'8' => HASHT,
|
173
|
+
'9' => NOH,
|
174
|
+
'0' => SEFR,
|
175
|
+
'-' => '-',
|
176
|
+
'=' => '=',
|
177
|
+
# Numbers With Shift key
|
178
|
+
'~' => '÷',
|
179
|
+
'!' => '!',
|
180
|
+
'@' => '٬',
|
181
|
+
'#' => '٫',
|
182
|
+
'$' => '﷼',
|
183
|
+
'%' => '٪',
|
184
|
+
'^' => '×',
|
185
|
+
'&' => '،',
|
186
|
+
'*' => '*',
|
187
|
+
'(' => ')',
|
188
|
+
')' => '(',
|
189
|
+
'_' => 'ـ',
|
190
|
+
'+' => '+'
|
191
|
+
}.freeze
|
192
|
+
end
|
193
|
+
end
|
@@ -0,0 +1,59 @@
|
|
1
|
+
module Persian
  # Homonym letter groups of Persian.
  # Each group collects Alphabet letters under one Latin label.
  module Homonyms
    include Alphabet

    T = [TE, TA].freeze

    S = [THE, SIN, SAD].freeze

    H = [HE_JIMI, HE_DOCHESHM].freeze

    Z = [ZAL, ZE, ZA, ZAD].freeze

    GH = [GHEIN, QAF].freeze

    # Every group, keyed by its label
    ALL = { T: T, S: S, H: H, Z: Z, GH: GH }.freeze

    # Every letter of every group, as one flat list
    ALL_a = [T, S, H, Z, GH].flatten.freeze

    # Reverse lookup: letter (as a string) => label of the group it belongs to
    ALL_r = ALL.each_with_object({}) do |(label, letters), index|
      letters.each { |letter| index[letter.to_s] = label }
    end.freeze
  end
end
|
@@ -0,0 +1,168 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian module
|
4
|
+
module Persian
  # Number class: lookup tables for digit conversion and for number words
  class Number
    include Alphabet

    # The ten digits of each script, listed in the same order so that
    # zipping two lists pairs up corresponding digits.
    english_digits = [ZERO, ONE, TWO, THREE, FOUR, FIVE, SIX, SEVEN, EIGHT, NINE]
    persian_digits = [SEFR, YEK, DOW, SE, CHAHAR, PANJ, SHESH, HAFT, HASHT, NOH]
    arabic_digits = [SIFR, WAHID, ATHNAN, THALETH, ARBE, KHAMSE, SETE, SABE, THMANY, LAYS]

    # English digit => Persian digit
    EN_FA_NUM = Hash[english_digits.zip(persian_digits)].freeze

    # Persian digit => English digit
    FA_EN_NUM = Hash[persian_digits.zip(english_digits)].freeze

    # Arabic digit => Persian digit
    AR_FA_NUM = Hash[arabic_digits.zip(persian_digits)].freeze

    # Persian digit => Arabic digit
    FA_AR_NUM = Hash[persian_digits.zip(arabic_digits)].freeze

    # Arabic digit => English digit
    AR_EN_NUM = Hash[arabic_digits.zip(english_digits)].freeze

    # English digit => Arabic digit
    EN_AR_NUM = Hash[english_digits.zip(arabic_digits)].freeze

    # Words for 0..9, indexed by value
    ONES = %w[
      صفر
      یک
      دو
      سه
      چهار
      پنج
      شش
      هفت
      هشت
      نه
    ].freeze

    # Words for 10..19, indexed by (value - 10)
    TEENS = %w[
      ده
      یازده
      دوازده
      سیزده
      چهارده
      پانزده
      شانزده
      هفده
      هجده
      نوزده
    ].freeze

    # Words for the tens, indexed by the tens digit (0 and 1 are blank)
    DECIMAL = (
      ['', ''] + %w[
        بیست
        سی
        چهل
        پنجاه
        شصت
        هفتاد
        هشتاد
        نود
      ]
    ).freeze

    # Words for the hundreds, indexed by the hundreds digit (0 is blank)
    HUNDREDS = (
      [''] + %w[
        صد
        دویست
        سیصد
        چهارصد
        پانصد
        ششصد
        هفتصد
        هشتصد
        نهصد
      ]
    ).freeze

    # Names of the three-digit groups, indexed by group position (0 is blank)
    LONGSCALE = (
      [''] + %w[
        هزار
        میلیون
        میلیارد
        بیلیون
        بیلیارد
        تریلیون
        تریلیارد
        کوآدریلیون
        کادریلیارد
        کوینتیلیون
        کوانتینیارد
        سکستیلیون
        سکستیلیارد
        سپتیلیون
        سپتیلیارد
        اکتیلیون
        اکتیلیارد
        نانیلیون
        نانیلیارد
        دسیلیون
        دسیلیارد
      ]
    ).freeze
  end
end
|
@@ -0,0 +1,53 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian module
|
4
|
+
module Persian
  # Persian NumText class
  # Spells numbers out in Persian words
  class NumText < Number
    # Spell a number out in Persian words.
    #
    # @param num [Integer, String] the number; a String is first converted
    #   to an Integer through Persian::Number.number
    # @param inner [Boolean] true when the value is the remainder of a larger
    #   number: a zero then yields '' and any other value is prefixed with
    #   the joiner ' و '. Used by the recursion; callers normally omit it.
    # @return [String] the number in words
    # @raise [ArgumentError] if the number has more three-digit groups than
    #   LONGSCALE has names for
    def self.num_to_char(num, inner = false)
      num = Persian::Number.number(num, lang: 'en', return: 'int') if num.is_a? String

      # A zero remainder adds nothing ("صد", not "صد و صفر").
      return '' if inner && num.zero?

      words =
        if num < 0
          # Without this branch a negative number recursed without end.
          'منفی ' + num_to_char(-num)
        elsif num < 10
          ONES[num]
        elsif num < 20
          TEENS[num - 10]
        elsif num < 100
          DECIMAL[num / 10] + num_to_char(num % 10, true)
        elsif num < 1000
          HUNDREDS[num / 100] + num_to_char(num % 100, true)
        else
          # Split off the leading three-digit group and name its scale.
          scale = (num.to_s.length - 1) / 3
          scale_name = LONGSCALE.fetch(scale) do
            raise ArgumentError, 'number is too large to spell out'
          end
          leading, remainder = num.divmod(1000**scale)
          num_to_char(leading) + ' ' + scale_name + num_to_char(remainder, true)
        end

      # Every non-zero remainder is joined with ' و ', including 10..19,
      # which previously were glued to the preceding word.
      inner ? ' و ' + words : words
    end
  end
end
|
@@ -0,0 +1,81 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian module
|
4
|
+
module Persian
  # Persian Number class
  #
  # An instance stores the number given to it as the string returned by
  # Number.number (Persian digits). The arithmetic operators return a new
  # string of Persian digits. The class methods convert digits between
  # Persian ('fa'), English ('en') and Arabic ('ar').
  class Number
    # @param num [Numeric, String, #to_s] the number, written in any of the
    #   supported digit sets
    def initialize(num)
      @value = Persian::Number.number(num)
    end

    # @return [String] the stored value; this also lets an instance be used
    #   as the operand of the arithmetic operators below
    def to_s
      @value
    end

    # @param other [Numeric, String, Persian::Number]
    # @return [String] the sum, in Persian digits
    def +(other)
      Number.to_persian(Number.to_i(@value) + Number.to_i(other))
    end

    # @param other [Numeric, String, Persian::Number]
    # @return [String] the difference, in Persian digits
    def -(other)
      Number.to_persian(Number.to_i(@value) - Number.to_i(other))
    end

    # @param other [Numeric, String, Persian::Number]
    # @return [String] the product, in Persian digits
    def *(other)
      Number.to_persian(Number.to_i(@value) * Number.to_i(other))
    end

    # Integer division.
    #
    # @param other [Numeric, String, Persian::Number]
    # @return [String] the quotient, in Persian digits
    # @raise [ZeroDivisionError] if other converts to zero
    def /(other)
      Number.to_persian(Number.to_i(@value) / Number.to_i(other))
    end

    # Convert the digits of +num+ to another digit set.
    #
    # Neither +num+ nor +opts+ is modified; a new string is built.
    #
    # @param num [Numeric, String, #to_s] value whose digits are converted
    # @param opts [Hash] :lang is the target digit set ('fa', 'en' or 'ar',
    #   default 'fa'); :return is 'string' (default) or 'int'
    # @return [String, Integer, nil] the converted value; nil when :return is
    #   neither 'string' nor 'int'
    # @raise [ArgumentError] if :lang is not a supported digit set
    def self.number(num, opts = { lang: 'fa', return: 'string' })
      lang = (opts[:lang] || 'fa').to_s
      wanted = (opts[:return] || 'string').to_s

      # Apply the substitutions one after another with the non-destructive
      # gsub, so that the caller's (possibly frozen) string is left alone.
      converted = digit_map(lang).reduce(num.to_s) do |text, (from, to)|
        text.gsub(from, to)
      end

      case wanted
      when 'string' then converted
      when 'int' then converted.to_i
      end
    end

    # Substitution table that turns the digits of the two other sets into
    # the digits of +lang+.
    def self.digit_map(lang)
      case lang
      when 'fa' then EN_FA_NUM.merge(AR_FA_NUM)
      when 'en' then FA_EN_NUM.merge(AR_EN_NUM)
      when 'ar' then FA_AR_NUM.merge(EN_AR_NUM)
      else
        raise ArgumentError, "unsupported lang: #{lang.inspect} (expected 'fa', 'en' or 'ar')"
      end
    end
    private_class_method :digit_map

    # Convert +num+ to Persian digits and insert a ',' between every group
    # of three digits.
    #
    # Only runs of digits are grouped, so a leading sign is never separated
    # from the number. A run that directly follows a decimal separator
    # ('.' or U+066B) is left ungrouped.
    #
    # @param num [Numeric, String, #to_s]
    # @return [String]
    def self.number_with_colon(num)
      number(num).gsub(/(?<![.\u066B\p{Nd}])\p{Nd}+/) do |digits|
        # Group from the right: reverse, cut into threes, join, reverse back.
        digits.reverse.scan(/\p{Nd}{1,3}/).join(',').reverse
      end
    end

    # @return [String] +num+ written with Persian digits
    def self.to_persian(num)
      number(num, lang: 'fa')
    end

    # @return [String] +num+ written with English digits
    def self.to_english(num)
      number(num, lang: 'en')
    end

    # @return [String] +num+ written with Arabic digits
    def self.to_arabic(num)
      number(num, lang: 'ar')
    end

    # @return [Integer] +num+ converted to English digits, then parsed with
    #   String#to_i
    def self.to_i(num)
      number(num, lang: 'en', return: 'int')
    end

    # Random number written with Persian digits.
    #
    # @param params passed straight to Kernel#rand
    # @return [String]
    def self.random(params = nil)
      number(rand(params))
    end
  end
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian module
|
4
|
+
module Persian
  # Persian Text class
  # Digest Persian texts
  class Text
    # Replace each English character with the character on the same key of
    # the standard Persian keyboard, as listed in EN_FA_KEYBOARD_CHAR.
    # For now only the QWERTY layout is supported.
    #
    # The text is converted in a single pass, so a character produced by one
    # mapping is never converted again by another ('{' gives '}' and stays
    # '}'). The argument is not modified.
    #
    # @param text [String]
    # @return [String] a new, converted string
    def self.english_to_persian_char(text)
      swap_keyboard_chars(text, EN_FA_KEYBOARD_CHAR)
    end

    # Replace each Persian keyboard character with the English character on
    # the same key: the reverse of EN_FA_KEYBOARD_CHAR.
    # For now only the QWERTY layout is supported.
    #
    # When several keys produce the same character the first one listed
    # wins; empty values are skipped because they cannot be matched.
    # The argument is not modified.
    #
    # @param text [String]
    # @return [String] a new, converted string
    def self.persian_to_english_char(text)
      reversed = EN_FA_KEYBOARD_CHAR.each_with_object({}) do |(english, persian), table|
        table[persian] ||= english unless persian.empty?
      end
      swap_keyboard_chars(text, reversed)
    end

    # Substitute every key of +table+ found in +text+ with its value, in a
    # single pass over the text.
    def self.swap_keyboard_chars(text, table)
      text.gsub(Regexp.union(table.keys), table)
    end
    private_class_method :swap_keyboard_chars
  end
end
|