persian 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +9 -0
- data/.gitignore +51 -0
- data/.rspec +3 -0
- data/.rubocop.yml +29 -0
- data/.travis.yml +8 -0
- data/Gemfile +10 -0
- data/Rakefile +36 -0
- data/lib/persian.rb +15 -9
- data/lib/persian/counter.rb +61 -0
- data/lib/persian/date.rb +150 -0
- data/lib/persian/dynamic.rb +38 -0
- data/lib/persian/list/alphabet.rb +107 -0
- data/lib/persian/list/character.rb +193 -0
- data/lib/persian/list/number.rb +154 -149
- data/lib/persian/num_text.rb +53 -0
- data/lib/persian/number.rb +69 -20
- data/lib/persian/text/keyboard.rb +22 -0
- data/lib/persian/text/text.rb +197 -0
- data/lib/persian/tokenizer.rb +42 -0
- data/lib/persian/unicode.rb +42 -0
- data/lib/persian/url.rb +25 -0
- data/lib/persian/version.rb +2 -1
- data/persian.gemspec +26 -0
- data/readme.md +47 -0
- data/spec/counter_spec.rb +83 -0
- data/spec/dynamic_spec.rb +6 -0
- data/spec/num_text_spec.rb +17 -0
- data/spec/number_spec.rb +129 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/text_spec.rb +236 -0
- data/spec/tokenizer_spec.rb +23 -0
- data/spec/unicode_spec.rb +25 -0
- data/spec/url_spec.rb +11 -0
- metadata +38 -12
- data/lib/persian/character.rb +0 -26
- data/lib/persian/num_to_char.rb +0 -60
@@ -0,0 +1,107 @@
|
|
1
|
+
module Persian
|
2
|
+
module Alphabet
|
3
|
+
ALEF = 'ا'.freeze
|
4
|
+
ALEF_MAD = 'آ'.freeze
|
5
|
+
BE = 'ب'.freeze
|
6
|
+
PE = 'پ'.freeze
|
7
|
+
TE = 'ت'.freeze
|
8
|
+
THE = 'ث'.freeze
|
9
|
+
JIM = 'ج'.freeze
|
10
|
+
CHE = 'چ'.freeze
|
11
|
+
HE_JIMI = 'ح'.freeze
|
12
|
+
KHE = 'خ'.freeze
|
13
|
+
DAL = 'د'.freeze
|
14
|
+
ZAL = 'ذ'.freeze
|
15
|
+
RE = 'ر'.freeze
|
16
|
+
ZE = 'ز'.freeze
|
17
|
+
ZHE = 'ژ'.freeze
|
18
|
+
SIN = 'س'.freeze
|
19
|
+
SHIN = 'ش'.freeze
|
20
|
+
SAD = 'ص'.freeze
|
21
|
+
ZAD = 'ض'.freeze
|
22
|
+
TA = 'ط'.freeze
|
23
|
+
ZA = 'ظ'.freeze
|
24
|
+
EIN = 'ع'.freeze
|
25
|
+
GHEIN = 'غ'.freeze
|
26
|
+
FE = 'ف'.freeze
|
27
|
+
QAF = 'ق'.freeze
|
28
|
+
KAF = 'ک'.freeze
|
29
|
+
GAF = 'گ'.freeze
|
30
|
+
LAM = 'ل'.freeze
|
31
|
+
MIM = 'م'.freeze
|
32
|
+
NOON = 'ن'.freeze
|
33
|
+
VAV = 'و'.freeze
|
34
|
+
HE_DOCHESHM = 'ه'.freeze
|
35
|
+
YE = 'ی'.freeze
|
36
|
+
|
37
|
+
KAF_ARABIC = 'ك'.freeze
|
38
|
+
YE_ARABIC = 'ي'.freeze
|
39
|
+
|
40
|
+
MAD = 'ٓ'.freeze
|
41
|
+
|
42
|
+
AA = 'َ'.freeze
|
43
|
+
EE = 'ِ'.freeze
|
44
|
+
OO = 'ُ'.freeze
|
45
|
+
|
46
|
+
AN = 'ً'.freeze
|
47
|
+
EN = 'ٍ'.freeze
|
48
|
+
ON = 'ٌ'.freeze
|
49
|
+
|
50
|
+
SAKEN = 'ْ'.freeze
|
51
|
+
TASHDID = 'ّ'.freeze
|
52
|
+
|
53
|
+
SPACE = ' '.freeze
|
54
|
+
ZWNJ = "\u200C".freeze
|
55
|
+
ZWJ = "\u200D".freeze
|
56
|
+
|
57
|
+
NOGHTE = '.'.freeze
|
58
|
+
VIRGOOL = '،'.freeze
|
59
|
+
DONOGHTE = ':'.freeze
|
60
|
+
NOGHTEVIRGOOL = '؛'.freeze
|
61
|
+
|
62
|
+
TAAJOB = '!'.freeze
|
63
|
+
SOAL = '؟'.freeze
|
64
|
+
|
65
|
+
BEALAVE = '+'.freeze
|
66
|
+
DARSAD = '٪'.freeze
|
67
|
+
MENHA = '-'.freeze
|
68
|
+
MOSAVI = '='.freeze
|
69
|
+
TAGHSIM = '÷'.freeze
|
70
|
+
ZARBDAR = '×'.freeze
|
71
|
+
|
72
|
+
KESH = 'ـ'.freeze
|
73
|
+
|
74
|
+
SEFR = '۰'.freeze
|
75
|
+
YEK = '۱'.freeze
|
76
|
+
DOW = '۲'.freeze
|
77
|
+
SE = '۳'.freeze
|
78
|
+
CHAHAR = '۴'.freeze
|
79
|
+
PANJ = '۵'.freeze
|
80
|
+
SHESH = '۶'.freeze
|
81
|
+
HAFT = '۷'.freeze
|
82
|
+
HASHT = '۸'.freeze
|
83
|
+
NOH = '۹'.freeze
|
84
|
+
|
85
|
+
SIFR = '٠'.freeze
|
86
|
+
WAHID = '١'.freeze
|
87
|
+
ATHNAN = '٢'.freeze
|
88
|
+
THALETH = '٣'.freeze
|
89
|
+
ARBE = '٤'.freeze
|
90
|
+
KHAMSE = '٥'.freeze
|
91
|
+
SETE = '٦'.freeze
|
92
|
+
SABE = '٧'.freeze
|
93
|
+
THMANY = '٨'.freeze
|
94
|
+
LAYS = '٩'.freeze
|
95
|
+
|
96
|
+
ZERO = '0'.freeze
|
97
|
+
ONE = '1'.freeze
|
98
|
+
TWO = '2'.freeze
|
99
|
+
THREE = '3'.freeze
|
100
|
+
FOUR = '4'.freeze
|
101
|
+
FIVE = '5'.freeze
|
102
|
+
SIX = '6'.freeze
|
103
|
+
SEVEN = '7'.freeze
|
104
|
+
EIGHT = '8'.freeze
|
105
|
+
NINE = '9'.freeze
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,193 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian class
|
4
|
+
module Persian
|
5
|
+
# Class text
|
6
|
+
class Text
|
7
|
+
include Alphabet
|
8
|
+
|
9
|
+
AR_FA_CHAR = {
|
10
|
+
KAF_ARABIC => KAF,
|
11
|
+
'دِ' => 'د',
|
12
|
+
'بِ' => 'ب',
|
13
|
+
'زِ' => 'ز',
|
14
|
+
'ذِ' => 'ذ',
|
15
|
+
'شِ' => 'ش',
|
16
|
+
'سِ' => 'س',
|
17
|
+
'ى' => 'ی',
|
18
|
+
YE_ARABIC => YE,
|
19
|
+
'ة' => 'ه',
|
20
|
+
'هٔ' => 'ه'
|
21
|
+
}.freeze
|
22
|
+
|
23
|
+
HAREKATS = [
|
24
|
+
AA, # Ae
|
25
|
+
EE, # E
|
26
|
+
OO, # O
|
27
|
+
AN, # An
|
28
|
+
EN, # En
|
29
|
+
ON, # On
|
30
|
+
SAKEN, # Saken
|
31
|
+
TASHDID # Tashdid
|
32
|
+
].freeze
|
33
|
+
|
34
|
+
BRACKETS = [
|
35
|
+
'[',
|
36
|
+
']',
|
37
|
+
'{',
|
38
|
+
'}',
|
39
|
+
'<',
|
40
|
+
'>',
|
41
|
+
'«',
|
42
|
+
'»'
|
43
|
+
].freeze
|
44
|
+
|
45
|
+
SIGNS = [
|
46
|
+
'!',
|
47
|
+
'@',
|
48
|
+
'#',
|
49
|
+
'$',
|
50
|
+
'%',
|
51
|
+
'&',
|
52
|
+
'*',
|
53
|
+
'~',
|
54
|
+
'`',
|
55
|
+
'\'',
|
56
|
+
'"',
|
57
|
+
':',
|
58
|
+
';',
|
59
|
+
'.',
|
60
|
+
'?',
|
61
|
+
'<',
|
62
|
+
'>',
|
63
|
+
'/',
|
64
|
+
'-',
|
65
|
+
'+',
|
66
|
+
'-',
|
67
|
+
'_',
|
68
|
+
'^',
|
69
|
+
MAD,
|
70
|
+
NOGHTE,
|
71
|
+
VIRGOOL,
|
72
|
+
NOGHTEVIRGOOL,
|
73
|
+
DONOGHTE,
|
74
|
+
TAAJOB,
|
75
|
+
SOAL,
|
76
|
+
BEALAVE,
|
77
|
+
DARSAD,
|
78
|
+
MENHA,
|
79
|
+
MOSAVI,
|
80
|
+
TAGHSIM,
|
81
|
+
ZARBDAR,
|
82
|
+
KESH
|
83
|
+
].freeze
|
84
|
+
|
85
|
+
END_VOWEL = [
|
86
|
+
HE_DOCHESHM,
|
87
|
+
ALEF,
|
88
|
+
VAV
|
89
|
+
].freeze
|
90
|
+
|
91
|
+
# Exchange Standard QWERTY Keyboard layout
|
92
|
+
EN_FA_KEYBOARD_CHAR = {
|
93
|
+
# Lowercase Letters
|
94
|
+
'q' => ZAD,
|
95
|
+
'w' => SAD,
|
96
|
+
'e' => THE,
|
97
|
+
'r' => QAF,
|
98
|
+
't' => FE,
|
99
|
+
'y' => GHEIN,
|
100
|
+
'u' => EIN,
|
101
|
+
'i' => HE_DOCHESHM,
|
102
|
+
'o' => KHE,
|
103
|
+
'p' => HE_JIMI,
|
104
|
+
'[' => JIM,
|
105
|
+
']' => CHE,
|
106
|
+
'\\' => '\\',
|
107
|
+
'a' => SHIN,
|
108
|
+
's' => SIN,
|
109
|
+
'd' => YE,
|
110
|
+
'f' => BE,
|
111
|
+
'g' => LAM,
|
112
|
+
'h' => ALEF,
|
113
|
+
'j' => TE,
|
114
|
+
'k' => NOON,
|
115
|
+
'l' => MIM,
|
116
|
+
';' => KAF,
|
117
|
+
'\'' => GAF,
|
118
|
+
'z' => ZA,
|
119
|
+
'x' => TA,
|
120
|
+
'c' => ZE,
|
121
|
+
'v' => RE,
|
122
|
+
'b' => ZAL,
|
123
|
+
'n' => DAL,
|
124
|
+
'm' => PE,
|
125
|
+
',' => VAV,
|
126
|
+
'.' => '.',
|
127
|
+
'/' => '/',
|
128
|
+
# Uppercase Letters
|
129
|
+
'Q' => 'ْ',
|
130
|
+
'W' => 'ٌ',
|
131
|
+
'E' => 'ٍ',
|
132
|
+
'R' => 'ً',
|
133
|
+
'T' => 'ُ',
|
134
|
+
'Y' => 'ِ',
|
135
|
+
'U' => 'َ',
|
136
|
+
'I' => 'ّ',
|
137
|
+
'O' => ']',
|
138
|
+
'P' => '[',
|
139
|
+
'{' => '}',
|
140
|
+
'}' => '{',
|
141
|
+
'|' => '|',
|
142
|
+
'A' => 'ؤ',
|
143
|
+
'S' => 'ئ',
|
144
|
+
'D' => 'ي',
|
145
|
+
'F' => 'إ',
|
146
|
+
'G' => 'أ',
|
147
|
+
'H' => 'آ',
|
148
|
+
'J' => 'ة',
|
149
|
+
'K' => '»',
|
150
|
+
'L' => '«',
|
151
|
+
':' => ':',
|
152
|
+
'"' => '؛',
|
153
|
+
'Z' => 'ك',
|
154
|
+
'X' => 'ٓ',
|
155
|
+
'C' => 'ژ',
|
156
|
+
'V' => 'ٰ',
|
157
|
+
'B' => "\u200C",
|
158
|
+
'N' => 'ٔ',
|
159
|
+
'M' => 'ء',
|
160
|
+
'<' => '>',
|
161
|
+
'>' => '<',
|
162
|
+
'?' => '؟',
|
163
|
+
# Numbers without shift key
|
164
|
+
'`' => "\u200D",
|
165
|
+
'1' => YEK,
|
166
|
+
'2' => DOW,
|
167
|
+
'3' => SE,
|
168
|
+
'4' => CHAHAR,
|
169
|
+
'5' => PANJ,
|
170
|
+
'6' => SHESH,
|
171
|
+
'7' => HAFT,
|
172
|
+
'8' => HASHT,
|
173
|
+
'9' => NOH,
|
174
|
+
'0' => SEFR,
|
175
|
+
'-' => '-',
|
176
|
+
'=' => '=',
|
177
|
+
# Numbers With Shift key
|
178
|
+
'~' => '÷',
|
179
|
+
'!' => '!',
|
180
|
+
'@' => '٬',
|
181
|
+
'#' => '٫',
|
182
|
+
'$' => '﷼',
|
183
|
+
'%' => '٪',
|
184
|
+
'^' => '×',
|
185
|
+
'&' => '،',
|
186
|
+
'*' => '*',
|
187
|
+
'(' => ')',
|
188
|
+
')' => '(',
|
189
|
+
'_' => 'ـ',
|
190
|
+
'+' => '+'
|
191
|
+
}.freeze
|
192
|
+
end
|
193
|
+
end
|
data/lib/persian/list/number.rb
CHANGED
@@ -1,163 +1,168 @@
|
|
1
1
|
# -*- coding: UTF-8 -*-
|
2
2
|
|
3
|
-
|
4
|
-
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
"2" => "۲",
|
9
|
-
"3" => "۳",
|
10
|
-
"4" => "۴",
|
11
|
-
"5" => "۵",
|
12
|
-
"6" => "۶",
|
13
|
-
"7" => "۷",
|
14
|
-
"8" => "۸",
|
15
|
-
"9" => "۹",
|
16
|
-
}
|
3
|
+
# Persian class
|
4
|
+
module Persian
|
5
|
+
# Number class
|
6
|
+
class Number
|
7
|
+
include Alphabet
|
17
8
|
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
9
|
+
EN_FA_NUM = {
|
10
|
+
# english numbers
|
11
|
+
ZERO => SEFR,
|
12
|
+
ONE => YEK,
|
13
|
+
TWO => DOW,
|
14
|
+
THREE => SE,
|
15
|
+
FOUR => CHAHAR,
|
16
|
+
FIVE => PANJ,
|
17
|
+
SIX => SHESH,
|
18
|
+
SEVEN => HAFT,
|
19
|
+
EIGHT => HASHT,
|
20
|
+
NINE => NOH
|
21
|
+
}.freeze
|
31
22
|
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
43
|
-
|
44
|
-
|
23
|
+
FA_EN_NUM = {
|
24
|
+
# english numbers
|
25
|
+
SEFR => ZERO,
|
26
|
+
YEK => ONE,
|
27
|
+
DOW => TWO,
|
28
|
+
SE => THREE,
|
29
|
+
CHAHAR => FOUR,
|
30
|
+
PANJ => FIVE,
|
31
|
+
SHESH => SIX,
|
32
|
+
HAFT => SEVEN,
|
33
|
+
HASHT => EIGHT,
|
34
|
+
NOH => NINE
|
35
|
+
}.freeze
|
45
36
|
|
46
|
-
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
37
|
+
AR_FA_NUM = {
|
38
|
+
# arabic numbers
|
39
|
+
SIFR => SEFR,
|
40
|
+
WAHID => YEK,
|
41
|
+
ATHNAN => DOW,
|
42
|
+
THALETH => SE,
|
43
|
+
ARBE => CHAHAR,
|
44
|
+
KHAMSE => PANJ,
|
45
|
+
SETE => SHESH,
|
46
|
+
SABE => HAFT,
|
47
|
+
THMANY => HASHT,
|
48
|
+
LAYS => NOH
|
49
|
+
}.freeze
|
59
50
|
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
|
68
|
-
|
69
|
-
|
70
|
-
|
71
|
-
|
51
|
+
FA_AR_NUM = {
|
52
|
+
# arabic numbers
|
53
|
+
SEFR => SIFR,
|
54
|
+
YEK => WAHID,
|
55
|
+
DOW => ATHNAN,
|
56
|
+
SE => THALETH,
|
57
|
+
CHAHAR => ARBE,
|
58
|
+
PANJ => KHAMSE,
|
59
|
+
SHESH => SETE,
|
60
|
+
HAFT => SABE,
|
61
|
+
HASHT => THMANY,
|
62
|
+
NOH => LAYS
|
63
|
+
}.freeze
|
72
64
|
|
73
|
-
|
74
|
-
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
65
|
+
AR_EN_NUM = {
|
66
|
+
SIFR => ZERO,
|
67
|
+
WAHID => ONE,
|
68
|
+
ATHNAN => TWO,
|
69
|
+
THALETH => THREE,
|
70
|
+
ARBE => FOUR,
|
71
|
+
KHAMSE => FIVE,
|
72
|
+
SETE => SIX,
|
73
|
+
SABE => SEVEN,
|
74
|
+
THMANY => EIGHT,
|
75
|
+
LAYS => NINE
|
76
|
+
}.freeze
|
85
77
|
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
91
|
-
|
92
|
-
|
93
|
-
|
94
|
-
|
95
|
-
|
96
|
-
|
97
|
-
|
78
|
+
EN_AR_NUM = {
|
79
|
+
ZERO => SIFR,
|
80
|
+
ONE => WAHID,
|
81
|
+
TWO => ATHNAN,
|
82
|
+
THREE => THALETH,
|
83
|
+
FOUR => ARBE,
|
84
|
+
FIVE => KHAMSE,
|
85
|
+
SIX => SETE,
|
86
|
+
SEVEN => SABE,
|
87
|
+
EIGHT => THMANY,
|
88
|
+
NINE => LAYS
|
89
|
+
}.freeze
|
98
90
|
|
99
|
-
|
100
|
-
|
101
|
-
|
102
|
-
|
103
|
-
|
104
|
-
|
105
|
-
|
106
|
-
|
107
|
-
|
108
|
-
|
109
|
-
|
110
|
-
|
91
|
+
ONES = [
|
92
|
+
'صفر',
|
93
|
+
'یک',
|
94
|
+
'دو',
|
95
|
+
'سه',
|
96
|
+
'چهار',
|
97
|
+
'پنج',
|
98
|
+
'شش',
|
99
|
+
'هفت',
|
100
|
+
'هشت',
|
101
|
+
'نه'
|
102
|
+
].freeze
|
111
103
|
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
|
122
|
-
|
123
|
-
|
104
|
+
TEENS = [
|
105
|
+
'ده',
|
106
|
+
'یازده',
|
107
|
+
'دوازده',
|
108
|
+
'سیزده',
|
109
|
+
'چهارده',
|
110
|
+
'پانزده',
|
111
|
+
'شانزده',
|
112
|
+
'هفده',
|
113
|
+
'هجده',
|
114
|
+
'نوزده'
|
115
|
+
].freeze
|
124
116
|
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
|
129
|
-
|
130
|
-
|
131
|
-
|
132
|
-
|
133
|
-
|
134
|
-
|
135
|
-
|
136
|
-
|
117
|
+
DECIMAL = [
|
118
|
+
'',
|
119
|
+
'',
|
120
|
+
'بیست',
|
121
|
+
'سی',
|
122
|
+
'چهل',
|
123
|
+
'پنجاه',
|
124
|
+
'شصت',
|
125
|
+
'هفتاد',
|
126
|
+
'هشتاد',
|
127
|
+
'نود'
|
128
|
+
].freeze
|
137
129
|
|
138
|
-
|
139
|
-
|
140
|
-
|
141
|
-
|
142
|
-
|
143
|
-
|
144
|
-
|
145
|
-
|
146
|
-
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
"کوانتینیارد",
|
151
|
-
"سکستیلیون",
|
152
|
-
"سکستیلیارد",
|
153
|
-
"سپتیلیون",
|
154
|
-
"سپتیلیارد",
|
155
|
-
"اکتیلیون",
|
156
|
-
"اکتیلیارد",
|
157
|
-
"نانیلیون",
|
158
|
-
"نانیلیارد",
|
159
|
-
"دسیلیون",
|
160
|
-
"دسیلیارد"
|
161
|
-
]
|
130
|
+
HUNDREDS = [
|
131
|
+
'',
|
132
|
+
'صد',
|
133
|
+
'دویست',
|
134
|
+
'سیصد',
|
135
|
+
'چهارصد',
|
136
|
+
'پانصد',
|
137
|
+
'ششصد',
|
138
|
+
'هفتصد',
|
139
|
+
'هشتصد',
|
140
|
+
'نهصد'
|
141
|
+
].freeze
|
162
142
|
|
143
|
+
LONGSCALE = [
|
144
|
+
'',
|
145
|
+
'هزار',
|
146
|
+
'میلیون',
|
147
|
+
'میلیارد',
|
148
|
+
'بیلیون',
|
149
|
+
'بیلیارد',
|
150
|
+
'تریلیون',
|
151
|
+
'تریلیارد',
|
152
|
+
'کوآدریلیون',
|
153
|
+
'کادریلیارد',
|
154
|
+
'کوینتیلیون',
|
155
|
+
'کوانتینیارد',
|
156
|
+
'سکستیلیون',
|
157
|
+
'سکستیلیارد',
|
158
|
+
'سپتیلیون',
|
159
|
+
'سپتیلیارد',
|
160
|
+
'اکتیلیون',
|
161
|
+
'اکتیلیارد',
|
162
|
+
'نانیلیون',
|
163
|
+
'نانیلیارد',
|
164
|
+
'دسیلیون',
|
165
|
+
'دسیلیارد'
|
166
|
+
].freeze
|
167
|
+
end
|
163
168
|
end
|