persian 0.1.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,53 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ # Persian module
4
module Persian
  # Persian NumText class
  # Work with numbers in words
  class NumText < Number
    # Spell out a non-negative whole number in words.
    #
    # num   - Integer, or a String of digits. A String is first converted to
    #         an Integer via Persian::Number.number(lang: 'en', return: 'int').
    # inner - true when the value is the trailing part of a larger number:
    #         a zero part then renders as '' and any other part is prefixed
    #         with the ' و ' connector.
    #
    # Words are looked up in ONES, TEENS, DECIMAL, HUNDREDS and LONGSCALE,
    # which are defined outside this class (presumably indexable by the
    # digit / group position used below -- confirm against their definition).
    #
    # Returns a String.
    # Raises ArgumentError for negative numbers (previously these recursed
    # without end).
    def self.num_to_char(num, inner = false)
      num = Persian::Number.number(num, lang: 'en', return: 'int') if num.is_a? String
      raise ArgumentError, "negative numbers are not supported: #{num}" if num < 0

      # A zero remainder contributes nothing to the surrounding number.
      return '' if inner && num.zero?

      spelled =
        if num < 10
          ONES[num]
        elsif num < 20
          TEENS[num - 10]
        elsif num < 100
          DECIMAL[num / 10] + num_to_char(num % 10, true)
        elsif num < 1000
          HUNDREDS[num / 100] + num_to_char(num % 100, true)
        else
          # Index of the highest three-digit group: 1 for 1_000..999_999,
          # 2 for the next three digits, and so on.
          scale = (num.to_s.length - 1) / 3
          head, rest = num.divmod(1000**scale)
          num_to_char(head) + ' ' + LONGSCALE[scale] + num_to_char(rest, true)
        end

      # Every non-zero inner part is joined with the connector, including
      # 10..19, which used to be appended without it.
      inner ? ' و ' + spelled : spelled
    end
  end
end
@@ -1,32 +1,81 @@
1
1
  # -*- coding: UTF-8 -*-
2
2
 
3
- class Persian
3
+ # Persian module
4
module Persian
  # Persian Number class
  # Init an instance by passing a number.
  # Basic operators work with Persian strings of numbers.
  class Number
    # num - a Numeric, or a String of digits; stored as a string converted by
    #       Number.number with its default options.
    def initialize(num)
      @value = Persian::Number.number(num)
    end

    # Returns the digit String held by this instance. Also lets an instance
    # be used as the right-hand operand of the arithmetic operators below.
    def to_s
      @value
    end

    # Each operator converts both operands with Number.to_i, applies the
    # Integer operation and returns the result through Number.to_persian.
    def +(other)
      Number.to_persian(Number.to_i(@value) + Number.to_i(other))
    end

    def -(other)
      Number.to_persian(Number.to_i(@value) - Number.to_i(other))
    end

    def *(other)
      Number.to_persian(Number.to_i(@value) * Number.to_i(other))
    end

    # Integer division; a zero divisor raises ZeroDivisionError.
    def /(other)
      Number.to_persian(Number.to_i(@value) / Number.to_i(other))
    end

    # Convert the digits of +num+ into the digit set selected by opts[:lang].
    #
    # num  - anything responding to to_s (Numeric, String, Number instance).
    #        The argument itself is never modified, so frozen strings work.
    # opts - :lang   => 'fa' (default), 'en' or 'ar'
    #        :return => 'string' (default) or 'int'; any other value falls
    #                   back to the string form.
    #        The hash passed by the caller is not modified.
    #
    # The digit tables (EN_FA_NUM, AR_FA_NUM, ...) are defined outside this
    # class.
    #
    # Returns a String, or an Integer when :return is 'int'.
    # Raises ArgumentError for an unsupported :lang.
    def self.number(num, opts = { lang: 'fa', return: 'string' })
      lang = opts[:lang] || 'fa'
      result_type = opts[:return] || 'string'

      digits =
        case lang
        when 'fa' then EN_FA_NUM.merge(AR_FA_NUM)
        when 'en' then FA_EN_NUM.merge(AR_EN_NUM)
        when 'ar' then FA_AR_NUM.merge(EN_AR_NUM)
        else raise ArgumentError, "unsupported lang: #{lang.inspect}"
        end

      # Non-destructive gsub: each step yields a new string.
      converted = digits.reduce(num.to_s) { |str, (from, to)| str.gsub(from, to) }

      result_type == 'int' ? converted.to_i : converted
    end

    # Convert +num+ with the default options and insert a ',' between every
    # group of three non-space characters, counted from the right.
    def self.number_with_colon(num)
      num = number(num)
      num.reverse.gsub(/(\S{3})(?=\S)/, '\\1,').reverse
    end

    def self.to_persian(num)
      number(num, lang: 'fa')
    end

    def self.to_english(num)
      number(num, lang: 'en')
    end

    def self.to_arabic(num)
      number(num, lang: 'ar')
    end

    # Returns +num+ as an Integer (digits are converted to 'en' first).
    def self.to_i(num)
      number(num, lang: 'en', return: 'int')
    end

    # Returns Kernel#rand(params) converted with the default options.
    def self.random(params = nil)
      number(rand(params))
    end
  end
end
@@ -0,0 +1,22 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ # Persian module
4
module Persian
  # Persian Text class
  # Keyboard-layout helpers for text typed with the wrong layout active.
  class Text
    # Replace each English character with the Persian character found on the
    # same key of a standard Persian keyboard (QWERTY only for now).
    #
    # Pairs come from EN_FA_KEYBOARD_CHAR, defined outside this class
    # (presumably English => Persian -- confirm against its definition).
    # The string is edited in place and the same object is returned.
    def self.english_to_persian_char(text)
      EN_FA_KEYBOARD_CHAR.each do |english, persian|
        text.gsub!(english, persian)
      end

      text
    end

    # Reverse of english_to_persian_char: replace each Persian keyboard
    # character with the English character on the same key (QWERTY only).
    # The string is edited in place and the same object is returned.
    def self.persian_to_english_char(text)
      EN_FA_KEYBOARD_CHAR.each do |english, persian|
        text.gsub!(persian, english)
      end

      text
    end
  end
end
@@ -0,0 +1,197 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ # Persian module
4
module Persian
  # Persian Text class
  # Digest Persian texts
  #
  # Unless noted otherwise, each helper edits the given String in place with
  # gsub! and returns that same object, so frozen strings cannot be passed.
  # "\u200C" below is the zero-width non-joiner (ZWNJ).
  class Text
    # Replace Arabic characters with Persian characters, using the pairs in
    # AR_FA_CHAR (defined outside this class).
    def self.character(text)
      AR_FA_CHAR.each { |k, v| text.gsub!(k, v) }
      text
    end

    # Remove extra spaces in text: collapse every whitespace run (newlines
    # included) to a single space, trim both ends, collapse runs of ZWNJ to
    # one ZWNJ and drop trailing ZWNJ. Returns a new String.
    def self.remove_extra_spaces(text)
      zwnj = "\u200C"
      text.split.join(' ').gsub(/#{zwnj}{2,}/, zwnj).sub(/#{zwnj}+\z/, '')
    end

    # Remove Arabic harekats: every entry of HAREKATS (defined outside this
    # class) is deleted from the text.
    def self.remove_harekats(text)
      HAREKATS.each { |v| text.gsub!(v, '') }
      text
    end

    # Remove all brackets: every entry of BRACKETS (defined outside this
    # class) is deleted from the text.
    def self.remove_brackets(text)
      BRACKETS.each { |v| text.gsub!(v, '') }
      text
    end

    # Replace every entry of SIGNS (defined outside this class) with +with+
    # (removed by default).
    def self.remove_signs(text, with = '')
      SIGNS.each { |v| text.gsub!(v, with) }
      text
    end

    # Replace "..", [..], {..} and (..) pairs with a single bracket style.
    # Default: « (0xAB) and » (0xBB). Pairs must open and close on one line.
    def self.general_brackets(text, left = '«', right = '»')
      text.gsub!(/"(.*?)"/, left + '\1' + right)
      text.gsub!(/\[(.*?)\]/, left + '\1' + right)
      text.gsub!(/\{(.*?)\}/, left + '\1' + right)
      text.gsub!(/\((.*?)\)/, left + '\1' + right)
      text
    end

    # Append ZWNJ + 'ی' when the last character is listed in END_VOWEL
    # (defined outside this class). Returns a new String in that case.
    def self.fix_y_after_vowel(text)
      text += "\u200Cی" if END_VOWEL.include? text[-1]
      text
    end

    # Replace the space after a standalone می or نمی with a ZWNJ.
    def self.replace_zwnj_mi(text)
      mi = 'می'
      nmi = 'نمی'
      text.gsub!(/(^|\s)(#{mi}|#{nmi})\s(\S+)/, "\\1\\2\u200C\\3")
      text
    end

    # Join 'است' to a preceding word that ends with 'ا':
    # "ا" + whitespace + "است" becomes "است" (e.g. 'دانا است' => 'داناست').
    def self.ast(text)
      a = 'ا'
      ast = 'است'
      st = 'ست'

      text.gsub!(/(#{a})\s(#{ast})/, '\1' + st)
      text
    end

    # Remove keshide (tatweel) characters from text.
    def self.keshide(text)
      text.gsub!(/ـ+/, '')
      text
    end

    # Use ی instead of ئ if the next char is ی
    # Example پائیز => پاییز
    def self.replace_e_y(text)
      e = 'ئ'
      y = 'ی'
      text.gsub!(/#{e}(#{y})/, '\1\1')
      text
    end

    # Replace three or more consecutive dots with a single ellipsis (…).
    def self.three_dots(text)
      text.gsub!(/\.{3,}/, '…')
      text
    end

    # Attach the detached suffixes تر / تری / ترین and ها / های to the
    # previous word: the whitespace before the suffix becomes a ZWNJ.
    # The suffix must not be followed by another letter, so a word that
    # merely starts with these letters is left untouched.
    def self.suffix(text)
      tar = 'تر'
      ee = 'ی'
      n = 'ن'
      ha = 'ها'
      ye = 'ی'
      # Both alternatives live inside one group so \1 always holds the suffix.
      pattern = /\s+(#{tar}(?:#{ee}(?:#{n})?)?|#{ha}(?:#{ye})?)(?![[:alpha:]])/
      text.gsub!(pattern, "\u200C\\1")
      text
    end

    # Collapse repeated Persian question marks into one.
    def self.remove_extra_question_mark(text)
      mark = '؟'
      text.gsub!(/(#{mark}){2,}/, '\1')
      text
    end

    # Insert a ZWNJ after the first +point+ characters of a line.
    # Returns a new String.
    def self.add_zwnj(text, point)
      text.scan(/^.{#{point}}|.+/).join("\u200C")
    end

    # Reduce a run of question marks followed by a run of exclamation marks
    # to a single '؟!'.
    def self.remove_question_exclamation(text)
      question = '؟'
      exclamation = '!'
      text.gsub!(/(#{question})+(#{exclamation})+/, '\1\2')
      text
    end

    # Drop the listed stop words and join the remaining whitespace-separated
    # words with single spaces. Returns a new String.
    def self.remove_stopwords(text)
      stopwords = ['و', 'در', 'به', 'این', 'با', 'از', 'که', 'است', 'را']
      text.scan(/\S+/).reject { |word| stopwords.include?(word) }.join(' ')
    end

    # Remove whitespace before a Persian semicolon (؛).
    def self.remove_space_noghtevirgool(text)
      noghtevirgool = '؛'
      text.gsub!(/\s+(#{noghtevirgool})/, '\1')
      text
    end

    # Remove punctuation that directly follows a Persian semicolon.
    def self.remove_signs_after_noghtevirgool(text)
      signs = '\.،؛:!؟\-…'
      noghtevirgool = '؛'
      text.gsub!(/(#{noghtevirgool})[#{signs}]+/, '\1')
      text
    end

    # Put a space between a Persian semicolon and a following non-space
    # character.
    def self.space_after_noghtevirgool(text)
      noghtevirgool = '؛'
      text.gsub!(/(#{noghtevirgool})(\S)/, '\1 \2')
      text
    end

    # Replace a Persian semicolon at the end of a line with a full stop.
    def self.remove_noghtevirgool_para_end(text)
      noghtevirgool = '؛'
      text.gsub!(/#{noghtevirgool}(\n|$)/, '.\1')
      text
    end

    # Remove a Persian semicolon (and any spaces/ZWNJ before it) that comes
    # right after an opening (, [ or «.
    def self.remove_noghtevirgool_baz_start(text)
      noghtevirgool = '؛'

      regex = /([\(\[«])[ \u200C]*[#{noghtevirgool}]/
      text.gsub!(regex, '\1')
      text
    end

    # Remove whitespace before a Persian comma (،).
    def self.remove_space_before_virgool(text)
      virgool = '،'

      text.gsub!(/\s+(#{virgool})/, '\1')
      text
    end

    # Remove punctuation that follows a Persian comma, keeping any spaces or
    # ZWNJ in between. A dot is removed only when it is not followed by
    # another dot.
    def self.remove_signs_after_virgool(text)
      pattern = /(،)([ \u200C]+)?([،؛:!؟\-][\.،؛:!؟\-]*|\.(?!\.))/

      text.gsub!(pattern, '\1\2')
      text
    end

    # Put a space between a Persian comma and a following non-space character.
    def self.space_after_virgool(text)
      virgool = '،'

      text.gsub!(/(#{virgool})(\S)/, '\1 \2')
      text
    end

    # Remove every occurrence of +char+.
    # NOTE(review): +char+ is interpolated into a regular expression without
    # escaping, so regex metacharacters in it are interpreted as a pattern.
    def self.rm_char(text, char)
      text.gsub!(/(#{char})/, '')
      text
    end

    # Replace a Persian comma at the end of a line with a full stop, keeping
    # any spaces, ZWNJ or newlines that follow it.
    def self.rm_virgool_in_end(text)
      text.gsub!(/(،)([ \u200C\n]+)?$/, '.\2')
      text
    end

    # Put a space between a dot and a following non-space character.
    def self.space_after_dot(text)
      text.gsub!(/(\.)(\S)/, '\1 \2')
      text
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ # Persian module
4
module Persian
  # Persian tokenize class
  class Tokenizer
    # Basic Persian word tokenizer.
    #
    # The text is split on whitespace; each chunk is then broken into runs of
    # ordinary characters and individual punctuation / bracket / quote
    # symbols, so 'a.b' gives ['a', '.', 'b'] and '«x»،' gives
    # ['«', 'x', '»', '،'].
    #
    # Returns an Array of tokens, or [''] when the text holds no tokens at
    # all (empty or whitespace-only input).
    def self.tokenize(text)
      symbols = ['!', '﷼', ':', '؛', '؟', '،', '-', '.']
      pair_pre = ['(', '{', '«', '<', '[']
      pair_post = [')', '}', '»', '>', ']']
      prepost = ["'", '"']

      # Split text with space characters
      splits = text.split(/\s/)

      return [''] if splits.empty?

      delimiters = Regexp.escape((symbols + pair_pre + pair_post + prepost).join)

      # Either a run of non-delimiter characters or one single delimiter.
      pattern = /[^#{delimiters}]+|[#{delimiters}]/

      # Empty chunks (from consecutive whitespace) scan to nothing.
      splits.flat_map { |chunk| chunk.scan(pattern) }
    end

    # Split paragraphs
    # Return an array of the non-empty lines of +text+.
    def self.split_paragraphs(text)
      text.split("\n").reject(&:empty?)
    end
  end
end
@@ -0,0 +1,42 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ # Persian module
4
module Persian
  # Persian Unicode class
  # Helpers for Unicode bidirectional formatting characters.
  class Unicode
    # Code point of POP DIRECTIONAL FORMATTING, which closes every
    # embedding/override opened by the helpers below.
    POP_DIRECTIONAL_FORMAT = 0x202C

    # Return the one-character UTF-8 String for a code point.
    #
    # char - an Integer code point (e.g. 0x202B), or a String holding the
    #        code point in hexadecimal (e.g. '202B').
    def self.codepoint_to_char(char)
      # Integer rather than Fixnum: Fixnum no longer exists on current Ruby,
      # and on older versions Fixnum is itself a kind of Integer.
      return [char].pack('U') if char.is_a? Integer

      [char.hex].pack('U')
    end

    # Return text between RIGHT-TO-LEFT EMBEDDING (U+202B) and
    # Pop Directional Format (U+202C)
    def self.rle(text)
      wrap(0x202B, text)
    end

    # Return text between LEFT-TO-RIGHT EMBEDDING (U+202A) and
    # Pop Directional Format (U+202C)
    def self.lre(text)
      wrap(0x202A, text)
    end

    # Return text between RIGHT-TO-LEFT OVERRIDE (U+202E) and
    # Pop Directional Format (U+202C)
    def self.rlo(text)
      wrap(0x202E, text)
    end

    # Return text between LEFT-TO-RIGHT OVERRIDE (U+202D) and
    # Pop Directional Format (U+202C)
    def self.lro(text)
      wrap(0x202D, text)
    end

    # Surround +text+ with the character for +open_tag+ and the pop character.
    def self.wrap(open_tag, text)
      codepoint_to_char(open_tag) + text + codepoint_to_char(POP_DIRECTIONAL_FORMAT)
    end
    private_class_method :wrap
  end
end