persian 0.0.0 → 0.2.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,214 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ # Persian module
4
+ module Persian
5
+ # Persian Text class
6
+ # Digest Persian texts
7
+ class Text
8
+ # Replace Arabic characters with Persian characters.
9
+ def self.character(text)
10
+ AR_FA_CHAR.each { |k, v| text.gsub!(k, v) }
11
+ text
12
+ end
13
+
14
+ # Remove extra spaces in text
15
+ def self.remove_extra_spaces(text)
16
+ text = text.split.join(' ')
17
+ text = text.split('‌').join('‌')
18
+ text
19
+ end
20
+
21
+ # Remove Arabic harekats (diacritics) from text
22
+ def self.remove_harekats(text)
23
+ HAREKATS.each { |v| text = text.gsub(v, '') }
24
+ text
25
+ end
26
+
27
+ # Remove all brackets
28
+ def self.remove_brackets(text)
29
+ BRACKETS.each { |v| text = text.gsub(v, '') }
30
+ text
31
+ end
32
+
33
+ # Remove Persian signs
34
+ def self.remove_signs(text, with = '')
35
+ return '' if text.nil?
36
+ SIGNS.each { |v| text = text.gsub(v, with) }
37
+ text
38
+ end
39
+
40
+ def self.replace_zwnj_with_space(text)
41
+ text = text.gsub(/(‌)/, ' ')
42
+ text
43
+ end
44
+
45
+ # Replace general brackets with one type brackets
46
+ # Default: 0xAB & 0xBB
47
+ def self.general_brackets(text, left = '«', right = '»')
48
+ text = text.gsub(/"(.*?)"/, left + '\1' + right)
49
+ text = text.gsub(/\[(.*?)\]/, left + '\1' + right)
50
+ text = text.gsub(/\{(.*?)\}/, left + '\1' + right)
51
+ text = text.gsub(/\((.*?)\)/, left + '\1' + right)
52
+ text
53
+ end
54
+
55
+ # Add '‌ی' after names that end with ه, ا, و
56
+ def self.fix_y_after_vowel(text)
57
+ text += '‌ی' if END_VOWEL.include? text[-1]
58
+ text
59
+ end
60
+
61
+ # Replace the space after می and نمی with a zero-width non-joiner (ZWNJ)
62
+ def self.replace_zwnj_mi(text)
63
+ mi = 'می'
64
+ nmi = 'نمی'
65
+ text.gsub!(/(^|\s)(#{mi}|#{nmi})\s(\S+)/, '\1\2‌\3')
66
+ text
67
+ end
68
+
69
+ # Replace \sاست with ست when the last character before \s is ا
70
+ def self.ast(text)
71
+ a = 'ا'
72
+ ast = 'است'
73
+ st = 'ست'
74
+
75
+ text.gsub!(/(#{a})\s(#{ast})/, '\1' + st)
76
+ text
77
+ end
78
+
79
+ # Remove keshide from text
80
+ def self.keshide(text)
81
+ text.gsub!(/ـ+/, '')
82
+ text
83
+ end
84
+
85
+ # Use ی instead of ئ if next char is ی
86
+ # Example پائیز => پاییز
87
+ def self.replace_e_y(text)
88
+ e = 'ئ'
89
+ y = 'ی'
90
+ text.gsub!(/#{e}(#{y})/, '\1\1')
91
+ text
92
+ end
93
+
94
+ def self.three_dots(text)
95
+ text.gsub!(/\.{3,}/, '…')
96
+ text
97
+ end
98
+
99
+ def self.suffix(text)
100
+ tar = 'تر'
101
+ ee = 'ی'
102
+ n = 'ن'
103
+ ha = 'ها'
104
+ ye = 'ی'
105
+ text.gsub!(/\s+(#{tar}(#{ee}(#{n})?)?)|(#{ha}(#{ye})?)\s+/, '‌\1')
106
+ text
107
+ end
108
+
109
+ def self.remove_extra_question_mark(text)
110
+ mark = '؟'
111
+ text.gsub!(/(#{mark}){2,}/, '\1')
112
+ text
113
+ end
114
+
115
+ def self.add_zwnj(text, point)
116
+ text = text.scan(/^.{#{point}}|.+/).join('‌')
117
+ text
118
+ end
119
+
120
+ def self.remove_question_exclamation(text)
121
+ question = '؟'
122
+ exclamation = '!'
123
+ text.gsub!(/(#{question})+(#{exclamation})+/, '\1\2')
124
+ text
125
+ end
126
+
127
+ def self.remove_stopwords(text)
128
+ stopwords = ['و', 'در', 'به', 'این', 'با', 'از', 'که', 'است', 'را']
129
+ words = text.scan(/\S+/)
130
+ keywords = words.select { |word| !stopwords.include?(word) }
131
+ keywords.join(' ')
132
+ end
133
+
134
+ def self.remove_space_noghtevirgool(text)
135
+ noghtevirgool = '؛'
136
+ text.gsub!(/\s+(#{noghtevirgool})/, '\1')
137
+ text
138
+ end
139
+
140
+ def self.remove_signs_after_noghtevirgool(text)
141
+ signs = '[\.،؛:!؟\-…]'
142
+ noghtevirgool = '؛'
143
+ text.gsub!(/(#{noghtevirgool})[#{signs}]+/, '\1')
144
+ text
145
+ end
146
+
147
+ def self.space_after_noghtevirgool(text)
148
+ noghtevirgool = '؛'
149
+ text.gsub!(/(#{noghtevirgool})(\S)/, '\1 \2')
150
+ text
151
+ end
152
+
153
+ def self.remove_noghtevirgool_para_end(text)
154
+ noghtevirgool = '؛'
155
+ text.gsub!(/#{noghtevirgool}(\n|$)/, '.\1')
156
+ text
157
+ end
158
+
159
+ def self.remove_noghtevirgool_baz_start(text)
160
+ noghtevirgool = '؛'
161
+
162
+ regex = /([\(\[«])[ ‌]*[#{noghtevirgool}]/
163
+ text.gsub!(regex, '\1')
164
+ text
165
+ end
166
+
167
+ def self.remove_space_before_virgool(text)
168
+ virgool = '،'
169
+
170
+ text.gsub!(/\s+(#{virgool})/, '\1')
171
+ text
172
+ end
173
+
174
+ def self.remove_signs_after_virgool(text)
175
+ pattern = /(،)([ ‌]+)?([،؛:!؟\-][\.،؛:!؟\-]*|\.(?!\.))/
176
+
177
+ text.gsub!(pattern, '\1\2')
178
+ text
179
+ end
180
+
181
+ def self.space_after_virgool(text)
182
+ virgool = '،'
183
+
184
+ text.gsub!(/(#{virgool})(\S)/, '\1 \2')
185
+ text
186
+ end
187
+
188
+ def self.rm_char(text, char)
189
+ text.gsub!(/(#{char})/, '')
190
+ text
191
+ end
192
+
193
+ def self.rm_virgool_in_end(text)
194
+ text.gsub!(/(،)([ ‌\n]+)?$/, '.\2')
195
+ text
196
+ end
197
+
198
+ def self.space_after_dot(text)
199
+ text.gsub!(/(\.)(\S)/, '\1 \2')
200
+ text
201
+ end
202
+
203
+ def self.squeeze(text)
204
+ text.squeeze
205
+ end
206
+
207
+ # Remove specific character from end of text
208
+ # Example: remove_postfix('پسره','ه')
209
+ def self.remove_postfix(text, postfix)
210
+ text.chomp!(postfix)
211
+ text
212
+ end
213
+ end
214
+ end
@@ -0,0 +1,56 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ # Persian module
4
+ module Persian
5
+ # Persian tokenizer class
6
+ class Tokenizer
7
+ # Basic persian word tokenizer
8
+ # Return an array of words
9
+ def self.tokenize(text)
10
+ symbols = ['!', '﷼', ':', '؛', '؟', '،', '-', '.']
11
+ pair_pre = ['(', '{', '«', '<', '[']
12
+ pair_post = [')', '}', '»', '>', ']']
13
+ prepost = ["'", '"']
14
+
15
+ # Split text with space characters
16
+ splits = text.split(/\s/)
17
+
18
+ return [''] if splits.empty?
19
+
20
+ options = symbols + pair_pre + pair_post + prepost
21
+
22
+ pattern = /[^#{Regexp.escape(options.join)}]+/
23
+ tokens = []
24
+
25
+ splits.each do |split|
26
+ first, middle, last = split.partition(pattern)
27
+ tokens << first.split unless first.empty?
28
+ tokens << middle unless middle.empty?
29
+ tokens << last.split unless last.empty?
30
+ end
31
+
32
+ tokens.flatten
33
+ end
34
+
35
+ def self.tokenize_more(text, num)
36
+ list = tokenize(text)
37
+ tokens = []
38
+ 0.upto list.size - num do |i|
39
+ token = ''
40
+ 0.upto num - 1 do |j|
41
+ token += list[i + j] + ' '
42
+ end
43
+ tokens.push token.strip
44
+ end
45
+
46
+ tokens
47
+ end
48
+
49
+ # Split paragraphs
50
+ # Return an array of paragraphs
51
+ def self.split_paragraphs(text)
52
+ text = text.split("\n").reject(&:empty?)
53
+ text
54
+ end
55
+ end
56
+ end
@@ -0,0 +1,42 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ # Persian module
4
+ module Persian
5
+ # Persian Unicode class
6
+ class Unicode
7
+ def self.codepoint_to_char(char)
8
+ return [char].pack('U') if char.is_a? Fixnum
9
+ [char.hex].pack('U')
10
+ end
11
+
12
+ # Return text wrapped between RIGHT-TO-LEFT EMBEDDING (U+202B) and POP DIRECTIONAL FORMATTING (U+202C)
13
+ def self.rle(text)
14
+ lre_tag = 0x202B
15
+ pop_tag = 0x202C
16
+
17
+ codepoint_to_char(lre_tag) + text + codepoint_to_char(pop_tag)
18
+ end
19
+
20
+ # Return text wrapped between LEFT-TO-RIGHT EMBEDDING (U+202A) and POP DIRECTIONAL FORMATTING (U+202C)
21
+ def self.lre(text)
22
+ rle_tag = 0x202A
23
+ pop_tag = 0x202C
24
+
25
+ codepoint_to_char(rle_tag) + text + codepoint_to_char(pop_tag)
26
+ end
27
+
28
+ def self.rlo(text)
29
+ lro_tag = 0x202E
30
+ pop_tag = 0x202C
31
+
32
+ codepoint_to_char(lro_tag) + text + codepoint_to_char(pop_tag)
33
+ end
34
+
35
+ def self.lro(text)
36
+ rlo_tag = 0x202D
37
+ pop_tag = 0x202C
38
+
39
+ codepoint_to_char(rlo_tag) + text + codepoint_to_char(pop_tag)
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,25 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ # Persian module
4
+ module Persian
5
+ # Persian Url class
6
+ class Url
7
+ def self.urlify(text)
8
+ # remove brackets
9
+ text = Text.remove_brackets(text)
10
+ # remove harekats
11
+ text = Text.remove_harekats(text)
12
+ # remove slash and backslash
13
+ text = text.gsub(%r{(\/||\\)}, '')
14
+ # remove signs
15
+ text = Text.remove_signs(text, ' ')
16
+ # Remove extra spaces
17
+ text = Text.remove_extra_spaces(text)
18
+ # trim spaces from start and end of text
19
+ text = text.strip
20
+ # replace space with dash
21
+ text = text.gsub(/\s/, '-')
22
+ text
23
+ end
24
+ end
25
+ end
@@ -1,5 +1,6 @@
1
1
  # -*- coding: UTF-8 -*-
2
2
 
3
+ # Persian module
3
4
  module Persian
4
- VERSION = '0.0.0'
5
+ VERSION = '0.2.2'.freeze
5
6
  end
data/lib/persian.rb CHANGED
@@ -1,42 +1,19 @@
1
1
  # -*- coding: UTF-8 -*-
2
2
 
3
- class Persian
4
- def self.number num
3
+ # lists
4
+ require 'persian/list/alphabet'
5
+ require 'persian/list/number'
6
+ require 'persian/list/character'
7
+ require 'persian/list/homonyms'
5
8
 
6
- if num.is_a? Numeric
7
- num = num.to_s
8
- end
9
-
10
- nums = {
11
- # english numbers
12
- "0" => "۰",
13
- "1" => "۱",
14
- "2" => "۲",
15
- "3" => "۳",
16
- "4" => "۴",
17
- "5" => "۵",
18
- "6" => "۶",
19
- "7" => "۷",
20
- "8" => "۸",
21
- "9" => "۹",
22
-
23
- #arabic numbers
24
- "٠" => "۰",
25
- "١" => "۱",
26
- "٢" => "۲",
27
- "٣" => "۳",
28
- "٤" => "۴",
29
- "٥" => "۵",
30
- "٦" => "۶",
31
- "٧" => "۷",
32
- "٨" => "۸",
33
- "٩" => "۹",
34
- }
35
-
36
- nums.each {|k, v|
37
- num.gsub!(k, v)
38
- }
39
-
40
- return num
41
- end
42
- end
9
+ # classes
10
+ require 'persian/number'
11
+ require 'persian/text/text'
12
+ require 'persian/text/keyboard'
13
+ require 'persian/num_text'
14
+ require 'persian/date'
15
+ require 'persian/tokenizer'
16
+ require 'persian/counter'
17
+ require 'persian/unicode'
18
+ require 'persian/dynamic'
19
+ require 'persian/url'
data/persian.gemspec ADDED
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+
3
+ lib = File.expand_path('../lib', __FILE__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+
6
+ require 'persian/version'
7
+
8
+ Gem::Specification.new do |s|
9
+ s.name = 'persian'
10
+ s.version = Persian::VERSION
11
+ s.date = '2022-03-25'
12
+ s.summary = 'Persian language for ruby.'
13
+ s.description = 'A set of utilities for Persian language.'
14
+ s.authors = ['Dariush Abbasi']
15
+ s.email = 'poshtehani@gmail.com'
16
+ s.files = `git ls-files`.split("\n")
17
+ s.test_files = `git ls-files -- {spec}/*`.split("\n")
18
+ s.executables =
19
+ `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
20
+ s.require_paths = ['lib']
21
+ s.homepage =
22
+ 'http://github.com/dariubs/persian.rb'
23
+ s.license = 'MIT'
24
+
25
+ s.add_development_dependency 'rspec', '3.4'
26
+ end
data/readme.md ADDED
@@ -0,0 +1,48 @@
1
+ <p align="center">
2
+ <img src="https://upload.wikimedia.org/wikipedia/commons/a/a2/Farsi.svg"
3
+ height="130" alt="Persian ruby gem">
4
+ </p>
5
+
6
+ <a href="https://travis-ci.org/negah/persian">
7
+ <img src="https://travis-ci.org/negah/persian.svg?branch=master"
8
+ alt="Build Status">
9
+ </a>
10
+
11
+ <a href="https://rubygems.org/gems/persian">
12
+ <img src="https://img.shields.io/badge/gem-persian-orange.svg"
13
+ alt="Ruby Gems">
14
+ </a>
15
+
16
+ <a href="https://rubygems.org/gems/persian">
17
+ <img src="https://img.shields.io/gem/dv/persian/stable.svg?maxAge=2592000"
18
+ alt="Ruby Gems downloads">
19
+ </a>
20
+
21
+ <a href="https://codeclimate.com/github/negah/persian">
22
+ <img src="https://codeclimate.com/github/negah/persian/badges/gpa.svg"
23
+ alt="Code Climate">
24
+ </a>
25
+
26
+ <p align="center"><sup><strong> Ruby gem for working with Persian text. </strong></sup></p>
27
+
28
+
29
+
30
+ Install
31
+ -----
32
+ ```shell
33
+ gem install persian
34
+ ```
35
+
36
+ Usage
37
+ -----
38
+ ```ruby
39
+ require 'persian'
40
+ ```
41
+
42
+ Components
43
+ ----------
44
+ incomplete.
45
+
46
+ License
47
+ -------
48
+ Released under the MIT License.
@@ -0,0 +1,83 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ require 'spec_helper'
4
+
5
+ describe 'persian counter methods' do
6
+ it 'should return a hash of characters with their number of occurrence' do
7
+ before = 'من غلام قمرم غیر قمر هیچ مگو'
8
+ after = {
9
+ 'م' => 6,
10
+ 'ن' => 1,
11
+ ' ' => 6,
12
+ 'غ' => 2,
13
+ 'ل' => 1,
14
+ 'ا' => 1,
15
+ 'ق' => 2,
16
+ 'ر' => 3,
17
+ 'ی' => 2,
18
+ 'ه' => 1,
19
+ 'چ' => 1,
20
+ 'گ' => 1,
21
+ 'و' => 1
22
+ }
23
+ arg = 'غ'
24
+ after_with_arg = 2
25
+
26
+ expect(Persian::Counter.character(before)).to eq(after)
27
+ expect(Persian::Counter.character(before, arg)).to eq(after_with_arg)
28
+ end
29
+
30
+ it 'should return a hash of words as key and number of occurrence of word as value' do
31
+ before = 'پرچم دوران هخامنشی به احتمال زیاد عقابی با بال های گشوده با قرص خورشیدی در پشت سر عقاب بوده است'
32
+ after = {
33
+ 'پرچم' => 1,
34
+ 'دوران' => 1,
35
+ 'هخامنشی' => 1,
36
+ 'به' => 1,
37
+ 'احتمال' => 1,
38
+ 'زیاد' => 1,
39
+ 'عقابی' => 1,
40
+ 'با' => 2,
41
+ 'بال' => 1,
42
+ 'های' => 1,
43
+ 'گشوده' => 1,
44
+ 'قرص' => 1,
45
+ 'خورشیدی' => 1,
46
+ 'در' => 1,
47
+ 'پشت' => 1,
48
+ 'سر' => 1,
49
+ 'عقاب' => 1,
50
+ 'بوده' => 1,
51
+ 'است' => 1
52
+ }
53
+ arg = 'با'
54
+ after_with_arg = 2
55
+
56
+ expect(Persian::Counter.word(before)).to eq(after)
57
+ expect(Persian::Counter.word(before, arg)).to eq(after_with_arg)
58
+ end
59
+
60
+ it 'should return number of paragraphs' do
61
+ text = "
62
+ یوهانس برامس در سال ۱۸۳۳ در شهر هامبورگ آلمان در خانواده‌ای فقیر به دنیا آمد. تحصیلات ابتدایی موسیقی را نزد پدرش که نوازنده کنترباس بود فرا گرفت.
63
+ برامس با ویولونیست‌های مشهوری چون رمنی و یواخیم آشنا شد و در طول این آشنایی بود که رمنی موسیقی محلی مجارستان را به برامس معرفی کرد و تحت تأثیر آن برامس رقص‌های مجار خود را نوشت.
64
+ "
65
+ after = 2
66
+
67
+ expect(Persian::Counter.paragraph(text)).to eq(after)
68
+ end
69
+
70
+ it 'shoud count uniq characters' do
71
+ text = 'دوستت دارم'
72
+ size = 8
73
+
74
+ expect(Persian::Counter.uniq_character(text)).to eq(size)
75
+ end
76
+
77
+ it 'shoud return length of text' do
78
+ text = 'راهی بزن که آهی بر ساز آن توان زد'
79
+ size = 33
80
+
81
+ expect(Persian::Counter.character_counter(text)).to eq(size)
82
+ end
83
+ end
@@ -0,0 +1,6 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ require 'spec_helper'
4
+
5
+ describe 'persian dynamic methods methods' do
6
+ end
@@ -0,0 +1,17 @@
1
+ # -*- coding: UTF-8 -*-
2
+
3
+ require 'spec_helper'
4
+
5
+ describe 'persian number to character methods' do
6
+ it 'should convert english numbers to spelled persian character' do
7
+ before = 1234
8
+ after = 'یک هزار و دویست و سی و چهار'
9
+ expect(Persian::NumText.num_to_char(before)).to eq(after)
10
+ end
11
+
12
+ it 'should convert Persian numbers to spelled persian number' do
13
+ before = '۲۰۴۸۲۰۴۸'
14
+ after = 'بیست میلیون و چهارصد و هشتاد و دو هزار و چهل و هشت'
15
+ expect(Persian::NumText.num_to_char(before)).to eq(after)
16
+ end
17
+ end