persian 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +9 -0
- data/.gitignore +51 -0
- data/.rspec +3 -0
- data/.rubocop.yml +29 -0
- data/.travis.yml +8 -0
- data/Gemfile +10 -0
- data/Rakefile +36 -0
- data/lib/persian.rb +15 -9
- data/lib/persian/counter.rb +61 -0
- data/lib/persian/date.rb +150 -0
- data/lib/persian/dynamic.rb +38 -0
- data/lib/persian/list/alphabet.rb +107 -0
- data/lib/persian/list/character.rb +193 -0
- data/lib/persian/list/number.rb +154 -149
- data/lib/persian/num_text.rb +53 -0
- data/lib/persian/number.rb +69 -20
- data/lib/persian/text/keyboard.rb +22 -0
- data/lib/persian/text/text.rb +197 -0
- data/lib/persian/tokenizer.rb +42 -0
- data/lib/persian/unicode.rb +42 -0
- data/lib/persian/url.rb +25 -0
- data/lib/persian/version.rb +2 -1
- data/persian.gemspec +26 -0
- data/readme.md +47 -0
- data/spec/counter_spec.rb +83 -0
- data/spec/dynamic_spec.rb +6 -0
- data/spec/num_text_spec.rb +17 -0
- data/spec/number_spec.rb +129 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/text_spec.rb +236 -0
- data/spec/tokenizer_spec.rb +23 -0
- data/spec/unicode_spec.rb +25 -0
- data/spec/url_spec.rb +11 -0
- metadata +38 -12
- data/lib/persian/character.rb +0 -26
- data/lib/persian/num_to_char.rb +0 -60
@@ -0,0 +1,53 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian module
|
4
|
+
module Persian
|
5
|
+
# Persian NumText class
|
6
|
+
# Work with number in words
|
7
|
+
class NumText < Number
|
8
|
+
# Spell out a non-negative integer in Persian words.
#
# The words come from the ONES, TEENS, DECIMAL, HUNDREDS and LONGSCALE
# lookup tables, which are defined outside this method.
#
# @param num [Integer, String] number to convert; a String is first turned
#   into an Integer through Persian::Number.number
# @param inner [Boolean] true when this call renders the tail of a larger
#   number: the fragment is then prefixed with the conjunction ' و ' and a
#   zero tail renders as an empty string
# @return [String] the number in words
# @raise [ArgumentError] if num is negative (previously this recursed until
#   the stack overflowed)
def self.num_to_char(num, inner = false)
  num = Persian::Number.number(num, lang: 'en', return: 'int') if num.is_a? String
  raise ArgumentError, "negative numbers are not supported: #{num}" if num < 0

  # A zero tail (the "00" of 100, the "000" of 5000) contributes no words.
  return '' if inner && num.zero?

  # Every non-empty tail is joined to what precedes it, teens included.
  joiner = inner ? ' و ' : ''

  spelled =
    if num < 10
      ONES[num]
    elsif num < 20
      TEENS[num - 10]
    elsif num < 100
      DECIMAL[num / 10] + num_to_char(num % 10, true)
    elsif num < 1000
      HUNDREDS[num / 100] + num_to_char(num % 100, true)
    else
      # Peel off the most significant group of one to three digits and name
      # it with the matching scale word; the remainder is rendered as a tail.
      # NOTE(review): a number with more groups than LONGSCALE has entries
      # raises TypeError here — confirm the table's size.
      scale_index = (num.to_s.length - 1) / 3
      unit = 1000**scale_index
      num_to_char(num / unit) + ' ' + LONGSCALE[scale_index] + num_to_char(num % unit, true)
    end

  joiner + spelled
end
|
52
|
+
end
|
53
|
+
end
|
data/lib/persian/number.rb
CHANGED
@@ -1,32 +1,81 @@
|
|
1
1
|
# -*- coding: UTF-8 -*-
|
2
2
|
|
3
|
-
|
3
|
+
# Persian module
|
4
|
+
module Persian
|
5
|
+
# Persian Number class
|
6
|
+
# Initialize an instance by passing a number
|
7
|
+
# The basic operators work on strings of Persian digits
|
8
|
+
class Number
|
9
|
+
# Store +num+ after passing it through Persian::Number.number with its
# default options.
def initialize(num)
  normalized = Persian::Number.number(num)
  @value = normalized
end
|
4
12
|
|
5
|
-
|
13
|
+
# Add +other+ to the stored value.
# Both operands are converted with Number.to_i and the sum is returned
# through Number.to_persian.
def +(other)
  left = Number.to_i(@value)
  right = Number.to_i(other)
  Number.to_persian(left + right)
end
|
6
16
|
|
7
|
-
|
8
|
-
|
17
|
+
# Subtract +other+ from the stored value.
# Both operands are converted with Number.to_i and the difference is
# returned through Number.to_persian.
def -(other)
  left = Number.to_i(@value)
  right = Number.to_i(other)
  Number.to_persian(left - right)
end
|
10
20
|
|
11
|
-
|
12
|
-
|
13
|
-
elsif opts[:lang] == "en"
|
14
|
-
nums = @persian_english_numbers.merge(@arabic_english_numbers)
|
15
|
-
elsif opts[:lang] == "ar"
|
16
|
-
nums = @persian_arabic_numbers.merge(@english_arabic_numbers)
|
21
|
+
# Multiply the stored value by +other+.
# Both operands are converted with Number.to_i and the product is returned
# through Number.to_persian.
def *(other)
  left = Number.to_i(@value)
  right = Number.to_i(other)
  Number.to_persian(left * right)
end
|
18
24
|
|
19
|
-
|
20
|
-
|
21
|
-
|
25
|
+
# Divide the stored value by +other+.
# Both operands are converted with Number.to_i, so this is integer division;
# the quotient is returned through Number.to_persian.
def /(other)
  left = Number.to_i(@value)
  right = Number.to_i(other)
  Number.to_persian(left / right)
end
|
28
|
+
|
29
|
+
# Rewrite the digits of +num+ in the requested script.
#
# @param num [String, Numeric] value to convert; a Numeric is stringified
#   first. An unfrozen String is edited in place (as before); a frozen one is
#   copied instead of raising.
# @param opts [Hash] options, read but never modified:
#   :lang   - 'fa' (default), 'en' or 'ar'; selects the digit tables
#   :return - 'string' (default) or 'int'
#   Symbol values such as :en or :int are accepted as well.
# @return [String, Integer, nil] the converted text, its +to_i+ for 'int',
#   or nil for any other :return value (unchanged from the original)
# @raise [ArgumentError] if :lang is not one of the supported codes
#   (previously a NoMethodError on nil)
def self.number(num, opts = { lang: 'fa', return: 'string' })
  lang = (opts[:lang] || 'fa').to_s
  format = (opts[:return] || 'string').to_s

  num = num.to_s if num.is_a? Numeric
  num = num.dup if num.frozen?

  nums =
    case lang
    when 'fa' then EN_FA_NUM.merge(AR_FA_NUM)
    when 'en' then FA_EN_NUM.merge(AR_EN_NUM)
    when 'ar' then FA_AR_NUM.merge(EN_AR_NUM)
    else raise ArgumentError, "unsupported lang #{lang.inspect} (expected 'fa', 'en' or 'ar')"
    end

  nums.each { |from, to| num.gsub!(from, to) }

  case format
  when 'string' then num
  when 'int' then num.to_i
  end
end
|
30
|
-
end
|
31
55
|
|
56
|
+
# Insert a ',' between every group of three digits, counted from the right.
# (Despite the method name, the separator is a comma.)
#
# The value is first converted with +number+ using its default options.
# Only decimal digits of any script are grouped, so a leading sign is no
# longer treated as part of a group ("-123456" used to give "-,123,456").
#
# @param num [String, Numeric] value to format
# @return [String] the digits with thousands separators
def self.number_with_colon(num)
  num = number(num)
  num.reverse.gsub(/(\p{Nd}{3})(?=\p{Nd})/, '\\1,').reverse
end
|
60
|
+
|
61
|
+
# Shortcut for +number+ with lang 'fa'.
def self.to_persian(num)
  options = { lang: 'fa' }
  number(num, options)
end
|
64
|
+
|
65
|
+
# Shortcut for +number+ with lang 'en'.
def self.to_english(num)
  options = { lang: 'en' }
  number(num, options)
end
|
68
|
+
|
69
|
+
# Shortcut for +number+ with lang 'ar'.
def self.to_arabic(num)
  options = { lang: 'ar' }
  number(num, options)
end
|
72
|
+
|
73
|
+
# Shortcut for +number+ with lang 'en' and return 'int'.
def self.to_i(num)
  options = { lang: 'en', return: 'int' }
  number(num, options)
end
|
76
|
+
|
77
|
+
# Draw a random value and convert it with +number+ using its default
# options. +params+ is handed to +rand+ unchanged.
def self.random(params = nil)
  drawn = rand(params)
  number(drawn)
end
|
80
|
+
end
|
32
81
|
end
|
@@ -0,0 +1,22 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian module
|
4
|
+
module Persian
  # Persian Text class
  # Keyboard-layout helpers, driven by the EN_FA_KEYBOARD_CHAR table that is
  # defined outside this file.
  class Text
    # Replace each English character with the Persian character found under
    # the same key, as listed in EN_FA_KEYBOARD_CHAR.
    # For now only the QWERTY layout is supported.
    #
    # An unfrozen +text+ is edited in place and returned; a frozen +text+ is
    # copied first instead of raising.
    #
    # @param text [String]
    # @return [String] the converted text
    def self.english_to_persian_char(text)
      text = text.dup if text.frozen?
      EN_FA_KEYBOARD_CHAR.each { |english, persian| text.gsub!(english, persian) }
      text
    end

    # Replace each Persian character with the English character found under
    # the same key; EN_FA_KEYBOARD_CHAR is applied in the reverse direction.
    # For now only the QWERTY layout is supported.
    #
    # An unfrozen +text+ is edited in place and returned; a frozen +text+ is
    # copied first instead of raising.
    #
    # @param text [String]
    # @return [String] the converted text
    def self.persian_to_english_char(text)
      text = text.dup if text.frozen?
      EN_FA_KEYBOARD_CHAR.each { |english, persian| text.gsub!(persian, english) }
      text
    end
  end
end
|
@@ -0,0 +1,197 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian module
|
4
|
+
module Persian
  # Persian Text class
  # Digest Persian texts
  #
  # Methods built on String#gsub! edit an unfrozen argument in place and
  # return it. A frozen argument is copied first, so frozen string literals
  # can be passed without raising.
  class Text
    # Zero-width non-joiner, written as an escape so the invisible character
    # cannot be lost when the file is copied or rendered.
    ZWNJ = "\u200C".freeze
    private_constant :ZWNJ

    # Replace Arabic characters with Persian characters, following the
    # AR_FA_CHAR table (defined outside this file).
    def self.character(text)
      text = editable(text)
      AR_FA_CHAR.each { |arabic, persian| text.gsub!(arabic, persian) }
      text
    end

    # Collapse every run of whitespace to one space, strip leading and
    # trailing whitespace, and drop trailing zero-width non-joiners.
    # Returns a new string.
    def self.remove_extra_spaces(text)
      text.split.join(' ').sub(/#{ZWNJ}+\z/, '')
    end

    # Remove every entry of HAREKATS (defined outside this file) from text.
    def self.remove_harekats(text)
      text = editable(text)
      HAREKATS.each { |harekat| text.gsub!(harekat, '') }
      text
    end

    # Remove every entry of BRACKETS (defined outside this file) from text.
    def self.remove_brackets(text)
      text = editable(text)
      BRACKETS.each { |bracket| text.gsub!(bracket, '') }
      text
    end

    # Replace every entry of SIGNS (defined outside this file) with +with+,
    # which defaults to the empty string.
    def self.remove_signs(text, with = '')
      text = editable(text)
      SIGNS.each { |sign| text.gsub!(sign, with) }
      text
    end

    # Replace "..." , [...], {...} and (...) pairs with one bracket style.
    # Default: « (0xAB) and » (0xBB).
    def self.general_brackets(text, left = '«', right = '»')
      text = editable(text)
      replacement = left + '\1' + right
      [/"(.*?)"/, /\[(.*?)\]/, /\{(.*?)\}/, /\((.*?)\)/].each do |pair|
        text.gsub!(pair, replacement)
      end
      text
    end

    # Append 'ی' when the last character of text is listed in END_VOWEL
    # (defined outside this file). Returns a new string when it appends.
    def self.fix_y_after_vowel(text)
      END_VOWEL.include?(text[-1]) ? text + 'ی' : text
    end

    # Replace the space after a leading می or نمی with a zero-width
    # non-joiner, attaching the prefix to the following word.
    def self.replace_zwnj_mi(text)
      text = editable(text)
      text.gsub!(/(^|\s)(می|نمی)\s(\S+)/, "\\1\\2#{ZWNJ}\\3")
      text
    end

    # Contract "ا است" (an ا, one whitespace character, then است) to "است".
    def self.ast(text)
      text = editable(text)
      text.gsub!(/ا\sاست/, 'است')
      text
    end

    # Remove keshide (tatweel) characters from text.
    def self.keshide(text)
      text = editable(text)
      text.gsub!(/ـ+/, '')
      text
    end

    # Use ی instead of ئ when the next character is ی.
    # Example: پائیز => پاییز
    def self.replace_e_y(text)
      text = editable(text)
      text.gsub!(/ئی/, 'یی')
      text
    end

    # Replace three or more consecutive dots with a single ellipsis character.
    def self.three_dots(text)
      text = editable(text)
      text.gsub!(/\.{3,}/, '…')
      text
    end

    # Attach a detached suffix (تر, تری, ترین, ها, های) to the preceding word:
    # the whitespace before it becomes a zero-width non-joiner.
    #
    # The suffix must not be followed by another letter, so a word that merely
    # starts with one of these sequences is left alone. The earlier pattern
    # had its alternation outside the group, which deleted "ها"/"های"
    # altogether and also joined words that only began with "تر".
    def self.suffix(text)
      text = editable(text)
      text.gsub!(/\s+(تر(?:ین?)?|های?)(?!\p{L})/, "#{ZWNJ}\\1")
      text
    end

    # Collapse two or more consecutive Persian question marks into one.
    def self.remove_extra_question_mark(text)
      text = editable(text)
      text.gsub!(/؟{2,}/, '؟')
      text
    end

    # Insert a zero-width non-joiner after the first +point+ characters.
    # Returns a new string; text is returned unchanged (as a copy) when
    # +point+ does not fall strictly inside it.
    def self.add_zwnj(text, point)
      return text.dup unless point > 0 && point < text.length

      text[0...point] + ZWNJ + text[point..-1]
    end

    # Reduce a run of Persian question marks followed by a run of exclamation
    # marks to a single "؟!".
    def self.remove_question_exclamation(text)
      text = editable(text)
      text.gsub!(/؟+!+/, '؟!')
      text
    end

    # Drop a fixed list of common stop words and rejoin the remaining words
    # with single spaces. Returns a new string.
    def self.remove_stopwords(text)
      stopwords = ['و', 'در', 'به', 'این', 'با', 'از', 'که', 'است', 'را']
      text.scan(/\S+/).reject { |word| stopwords.include?(word) }.join(' ')
    end

    # Remove whitespace before a Persian semicolon (؛).
    def self.remove_space_noghtevirgool(text)
      text = editable(text)
      text.gsub!(/\s+؛/, '؛')
      text
    end

    # Remove punctuation that directly follows a Persian semicolon.
    def self.remove_signs_after_noghtevirgool(text)
      text = editable(text)
      text.gsub!(/؛[.،؛:!؟\-…]+/, '؛')
      text
    end

    # Put a space after a Persian semicolon that is directly followed by a
    # non-whitespace character.
    def self.space_after_noghtevirgool(text)
      text = editable(text)
      text.gsub!(/؛(\S)/, '؛ \1')
      text
    end

    # Replace a Persian semicolon at the end of a line with a full stop.
    def self.remove_noghtevirgool_para_end(text)
      text = editable(text)
      text.gsub!(/؛(\n|$)/, '.\1')
      text
    end

    # Remove a Persian semicolon (and any spaces before it) that directly
    # follows an opening (, [ or «.
    def self.remove_noghtevirgool_baz_start(text)
      text = editable(text)
      text.gsub!(/([\(\[«])[ ]*[؛]/, '\1')
      text
    end

    # Remove whitespace before a Persian comma (،).
    def self.remove_space_before_virgool(text)
      text = editable(text)
      text.gsub!(/\s+،/, '،')
      text
    end

    # Remove punctuation that follows a Persian comma, keeping any spaces in
    # between. A dot is removed only when it is not followed by another dot.
    def self.remove_signs_after_virgool(text)
      text = editable(text)
      pattern = /(،)([ ]+)?([،؛:!؟\-][\.،؛:!؟\-]*|\.(?!\.))/
      text.gsub!(pattern, '\1\2')
      text
    end

    # Put a space after a Persian comma that is directly followed by a
    # non-whitespace character.
    def self.space_after_virgool(text)
      text = editable(text)
      text.gsub!(/،(\S)/, '، \1')
      text
    end

    # Remove every occurrence of +char+ from text.
    #
    # A String is matched literally, so characters such as '.' or '(' no
    # longer act as regex syntax (or raise RegexpError). Pass a Regexp to
    # match a pattern.
    def self.rm_char(text, char)
      text = editable(text)
      pattern = char.is_a?(Regexp) ? char : Regexp.new(Regexp.escape(char.to_s))
      text.gsub!(pattern, '')
      text
    end

    # Replace a Persian comma at the end of a line (optionally followed by
    # spaces or newlines) with a full stop.
    def self.rm_virgool_in_end(text)
      text = editable(text)
      text.gsub!(/،([ \n]+)?$/, '.\1')
      text
    end

    # Put a space after a dot that is directly followed by a non-whitespace
    # character.
    def self.space_after_dot(text)
      text = editable(text)
      text.gsub!(/(\.)(\S)/, '\1 \2')
      text
    end

    # Return text itself when it can be edited in place, or an unfrozen copy
    # when it is frozen.
    def self.editable(text)
      text.frozen? ? text.dup : text
    end
    private_class_method :editable
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian module
|
4
|
+
module Persian
  # Persian tokenize class
  class Tokenizer
    # Punctuation that is split off from words: sentence symbols, opening and
    # closing pair characters, and quote marks.
    DELIMITERS = [
      '!', '﷼', ':', '؛', '؟', '،', '-', '.',
      '(', '{', '«', '<', '[',
      ')', '}', '»', '>', ']',
      "'", '"'
    ].freeze

    # The delimiters escaped for use inside a regex character class.
    DELIMITER_CLASS = Regexp.escape(DELIMITERS.join).freeze

    # Either a run of non-delimiter characters (a word) or a run of
    # delimiter characters (punctuation).
    TOKEN_PATTERN = /[^#{DELIMITER_CLASS}]+|[#{DELIMITER_CLASS}]+/

    private_constant :DELIMITERS, :DELIMITER_CLASS, :TOKEN_PATTERN

    # Basic Persian word tokenizer.
    #
    # The text is split on whitespace, then every chunk is cut into
    # alternating word and punctuation tokens. Adjacent punctuation characters
    # stay together as one token. Unlike the earlier partition-based version,
    # words after the first one in a chunk are separated too
    # ("a:b" => ["a", ":", "b"] rather than ["a", ":b"]).
    #
    # @param text [String]
    # @return [Array<String>] the tokens; [''] when text holds no tokens
    def self.tokenize(text)
      tokens = text.split(/\s/).flat_map { |chunk| chunk.scan(TOKEN_PATTERN) }
      tokens.empty? ? [''] : tokens
    end

    # Split text into paragraphs on "\n", dropping empty lines.
    #
    # @param text [String]
    # @return [Array<String>] the non-empty lines
    def self.split_paragraphs(text)
      text.split("\n").reject(&:empty?)
    end
  end
end
|
@@ -0,0 +1,42 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
# Persian module
|
4
|
+
module Persian
  # Persian Unicode class
  # Wraps text in Unicode bidirectional formatting characters.
  class Unicode
    # Convert a code point to its UTF-8 character.
    #
    # Integer is tested instead of Fixnum: Fixnum was merged into Integer and
    # has been removed from current Ruby, where referencing it raises
    # NameError.
    #
    # @param char [Integer, String] the code point as an Integer, or as a
    #   String of hexadecimal digits (for example '202B')
    # @return [String] the character
    def self.codepoint_to_char(char)
      codepoint = char.is_a?(Integer) ? char : char.hex
      [codepoint].pack('U')
    end

    # Return text between RIGHT-TO-LEFT EMBEDDING (U+202B) and
    # POP DIRECTIONAL FORMATTING (U+202C).
    def self.rle(text)
      bidi_wrap(0x202B, text)
    end

    # Return text between LEFT-TO-RIGHT EMBEDDING (U+202A) and
    # POP DIRECTIONAL FORMATTING (U+202C).
    def self.lre(text)
      bidi_wrap(0x202A, text)
    end

    # Return text between RIGHT-TO-LEFT OVERRIDE (U+202E) and
    # POP DIRECTIONAL FORMATTING (U+202C).
    def self.rlo(text)
      bidi_wrap(0x202E, text)
    end

    # Return text between LEFT-TO-RIGHT OVERRIDE (U+202D) and
    # POP DIRECTIONAL FORMATTING (U+202C).
    def self.lro(text)
      bidi_wrap(0x202D, text)
    end

    # Surround text with the given opening formatting character and the
    # POP DIRECTIONAL FORMATTING character (U+202C).
    def self.bidi_wrap(opening, text)
      codepoint_to_char(opening) + text + codepoint_to_char(0x202C)
    end
    private_class_method :bidi_wrap
  end
end
|