persian 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +5 -5
- data/.rubocop.yml +3 -0
- data/lib/persian/list/homonyms.rb +59 -0
- data/lib/persian/text/text.rb +24 -7
- data/lib/persian/tokenizer.rb +14 -0
- data/lib/persian/url.rb +2 -2
- data/lib/persian/version.rb +1 -1
- data/lib/persian.rb +1 -0
- data/persian.gemspec +2 -2
- data/readme.md +22 -21
- data/spec/text_spec.rb +22 -0
- data/spec/tokenizer_spec.rb +8 -0
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ad07e72f8e952adecef3078d3132de48372e936682eaf67dfc77745d863d24d5
|
4
|
+
data.tar.gz: 618f5b540864a034b4fc5bae2865a1c6de8f0fff50fb12eb1c28e4a506062d06
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5dd769632abf8da06746802aedbcfc83a3bb49edfec68364f068c476454469b38ba8e91a0ea721026202d025714a88ee4a969d76ea928f88b0a48af134cb6f71
|
7
|
+
data.tar.gz: ad6bea9eee317516acfe91241137896f474877531ef34c4b69fe1e297b7ca346757e88b9baf7e67dc16b01b9ddc86994900ae1b442ac2325cf59cc909c955c9b
|
data/.rubocop.yml
CHANGED
data/lib/persian/list/homonyms.rb
ADDED
@@ -0,0 +1,59 @@
|
|
1
|
+
module Persian
|
2
|
+
# Homonyms of persian
|
3
|
+
module Homonyms
|
4
|
+
include Alphabet
|
5
|
+
|
6
|
+
T = [
|
7
|
+
TE,
|
8
|
+
TA
|
9
|
+
].freeze
|
10
|
+
|
11
|
+
S = [
|
12
|
+
THE,
|
13
|
+
SIN,
|
14
|
+
SAD
|
15
|
+
].freeze
|
16
|
+
|
17
|
+
H = [
|
18
|
+
HE_JIMI,
|
19
|
+
HE_DOCHESHM
|
20
|
+
].freeze
|
21
|
+
|
22
|
+
Z = [
|
23
|
+
ZAL,
|
24
|
+
ZE,
|
25
|
+
ZA,
|
26
|
+
ZAD
|
27
|
+
].freeze
|
28
|
+
|
29
|
+
GH = [
|
30
|
+
GHEIN,
|
31
|
+
QAF
|
32
|
+
].freeze
|
33
|
+
|
34
|
+
# List of all Homonyms classified in a hash
|
35
|
+
ALL = {
|
36
|
+
T: T,
|
37
|
+
S: S,
|
38
|
+
H: H,
|
39
|
+
Z: Z,
|
40
|
+
GH: GH
|
41
|
+
}.freeze
|
42
|
+
|
43
|
+
# Flat array of all homonyms from every group
|
44
|
+
ALL_a = [
|
45
|
+
T, S, H, Z, GH
|
46
|
+
].flatten.freeze
|
47
|
+
|
48
|
+
# Reverse lookup hash: maps each homonym entry (as a string) to its group key
|
49
|
+
temp = {}
|
50
|
+
|
51
|
+
ALL.each do |key, value|
|
52
|
+
value.each do |i|
|
53
|
+
temp[i.to_s] = key
|
54
|
+
end
|
55
|
+
end
|
56
|
+
|
57
|
+
ALL_r = temp.freeze
|
58
|
+
end
|
59
|
+
end
|
data/lib/persian/text/text.rb
CHANGED
@@ -20,29 +20,35 @@ module Persian
|
|
20
20
|
|
21
21
|
# Remove Arabic harekats from text
|
22
22
|
def self.remove_harekats(text)
|
23
|
-
HAREKATS.each { |v| text.gsub
|
23
|
+
HAREKATS.each { |v| text = text.gsub(v, '') }
|
24
24
|
text
|
25
25
|
end
|
26
26
|
|
27
27
|
# Remove all brackets
|
28
28
|
def self.remove_brackets(text)
|
29
|
-
BRACKETS.each { |v| text.gsub
|
29
|
+
BRACKETS.each { |v| text = text.gsub(v, '') }
|
30
30
|
text
|
31
31
|
end
|
32
32
|
|
33
33
|
# Remove Persian signs
|
34
34
|
def self.remove_signs(text, with = '')
|
35
|
-
|
35
|
+
return '' if text.nil?
|
36
|
+
SIGNS.each { |v| text = text.gsub(v, with) }
|
37
|
+
text
|
38
|
+
end
|
39
|
+
|
40
|
+
def self.replace_zwnj_with_space(text)
|
41
|
+
text = text.gsub(/()/, ' ')
|
36
42
|
text
|
37
43
|
end
|
38
44
|
|
39
45
|
# Replace general brackets with one type brackets
|
40
46
|
# Default: 0xAB & 0xBB
|
41
47
|
def self.general_brackets(text, left = '«', right = '»')
|
42
|
-
text.gsub
|
43
|
-
text.gsub
|
44
|
-
text.gsub
|
45
|
-
text.gsub
|
48
|
+
text = text.gsub(/"(.*?)"/, left + '\1' + right)
|
49
|
+
text = text.gsub(/\[(.*?)\]/, left + '\1' + right)
|
50
|
+
text = text.gsub(/\{(.*?)\}/, left + '\1' + right)
|
51
|
+
text = text.gsub(/\((.*?)\)/, left + '\1' + right)
|
46
52
|
text
|
47
53
|
end
|
48
54
|
|
@@ -193,5 +199,16 @@ module Persian
|
|
193
199
|
text.gsub!(/(\.)(\S)/, '\1 \2')
|
194
200
|
text
|
195
201
|
end
|
202
|
+
|
203
|
+
def self.squeeze(text)
|
204
|
+
text.squeeze
|
205
|
+
end
|
206
|
+
|
207
|
+
# Remove specific character from end of text
|
208
|
+
# Example: remove_postfix('پسره','ه')
|
209
|
+
def self.remove_postfix(text, postfix)
|
210
|
+
text.chomp!(postfix)
|
211
|
+
text
|
212
|
+
end
|
196
213
|
end
|
197
214
|
end
|
data/lib/persian/tokenizer.rb
CHANGED
@@ -32,6 +32,20 @@ module Persian
|
|
32
32
|
tokens.flatten
|
33
33
|
end
|
34
34
|
|
35
|
+
def self.tokenize_more(text, num)
|
36
|
+
list = tokenize(text)
|
37
|
+
tokens = []
|
38
|
+
0.upto list.size - num do |i|
|
39
|
+
token = ''
|
40
|
+
0.upto num - 1 do |j|
|
41
|
+
token += list[i + j] + ' '
|
42
|
+
end
|
43
|
+
tokens.push token.strip
|
44
|
+
end
|
45
|
+
|
46
|
+
tokens
|
47
|
+
end
|
48
|
+
|
35
49
|
# Split paragraphs
|
36
50
|
# Return an array of paragraphs
|
37
51
|
def self.split_paragraphs(text)
|
data/lib/persian/url.rb
CHANGED
@@ -10,7 +10,7 @@ module Persian
|
|
10
10
|
# remove harekats
|
11
11
|
text = Text.remove_harekats(text)
|
12
12
|
# remove slash and backslash
|
13
|
-
text.gsub
|
13
|
+
text = text.gsub(%r{(\/||\\)}, '')
|
14
14
|
# remove signs
|
15
15
|
text = Text.remove_signs(text, ' ')
|
16
16
|
# Remove extra spaces
|
@@ -18,7 +18,7 @@ module Persian
|
|
18
18
|
# trim spaces from start and end of text
|
19
19
|
text = text.strip
|
20
20
|
# replace space with dash
|
21
|
-
text.gsub
|
21
|
+
text = text.gsub(/\s/, '-')
|
22
22
|
text
|
23
23
|
end
|
24
24
|
end
|
data/lib/persian/version.rb
CHANGED
data/lib/persian.rb
CHANGED
data/persian.gemspec
CHANGED
@@ -8,7 +8,7 @@ require 'persian/version'
|
|
8
8
|
Gem::Specification.new do |s|
|
9
9
|
s.name = 'persian'
|
10
10
|
s.version = Persian::VERSION
|
11
|
-
s.date = '
|
11
|
+
s.date = '2022-03-25'
|
12
12
|
s.summary = 'Persian language for ruby.'
|
13
13
|
s.description = 'A set of utilities for Persian language.'
|
14
14
|
s.authors = ['Dariush Abbasi']
|
@@ -19,7 +19,7 @@ Gem::Specification.new do |s|
|
|
19
19
|
`git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
20
20
|
s.require_paths = ['lib']
|
21
21
|
s.homepage =
|
22
|
-
'http://github.com/
|
22
|
+
'http://github.com/dariubs/persian.rb'
|
23
23
|
s.license = 'MIT'
|
24
24
|
|
25
25
|
s.add_development_dependency 'rspec', '3.4'
|
data/readme.md
CHANGED
@@ -1,27 +1,28 @@
|
|
1
1
|
<p align="center">
|
2
2
|
<img src="https://upload.wikimedia.org/wikipedia/commons/a/a2/Farsi.svg"
|
3
|
-
height="130" alt="Persian">
|
4
|
-
</p>
|
5
|
-
<p align="center">
|
6
|
-
<a href="https://travis-ci.org/negah/persian">
|
7
|
-
<img src="https://travis-ci.org/negah/persian.svg?branch=master"
|
8
|
-
alt="Build Status">
|
9
|
-
</a>
|
10
|
-
<a href="https://rubygems.org/gems/persian">
|
11
|
-
<img src="https://img.shields.io/badge/gem-persian-orange.svg"
|
12
|
-
alt="Ruby Gems">
|
13
|
-
</a>
|
14
|
-
|
15
|
-
<a href="https://rubygems.org/gems/persian">
|
16
|
-
<img src="https://img.shields.io/gem/dv/persian/stable.svg?maxAge=2592000"
|
17
|
-
alt="Ruby Gems downloads">
|
18
|
-
</a>
|
19
|
-
|
20
|
-
<a href="https://codeclimate.com/github/negah/persian">
|
21
|
-
<img src="https://codeclimate.com/github/negah/persian/badges/gpa.svg"
|
22
|
-
alt="Code Climate">
|
23
|
-
</a>
|
3
|
+
height="130" alt="Persian ruby gem">
|
24
4
|
</p>
|
5
|
+
|
6
|
+
<a href="https://travis-ci.org/negah/persian">
|
7
|
+
<img src="https://travis-ci.org/negah/persian.svg?branch=master"
|
8
|
+
alt="Build Status">
|
9
|
+
</a>
|
10
|
+
|
11
|
+
<a href="https://rubygems.org/gems/persian">
|
12
|
+
<img src="https://img.shields.io/badge/gem-persian-orange.svg"
|
13
|
+
alt="Ruby Gems">
|
14
|
+
</a>
|
15
|
+
|
16
|
+
<a href="https://rubygems.org/gems/persian">
|
17
|
+
<img src="https://img.shields.io/gem/dv/persian/stable.svg?maxAge=2592000"
|
18
|
+
alt="Ruby Gems downloads">
|
19
|
+
</a>
|
20
|
+
|
21
|
+
<a href="https://codeclimate.com/github/negah/persian">
|
22
|
+
<img src="https://codeclimate.com/github/negah/persian/badges/gpa.svg"
|
23
|
+
alt="Code Climate">
|
24
|
+
</a>
|
25
|
+
|
25
26
|
<p align="center"><sup><strong> Ruby gem for working with Persian text. </strong></sup></p>
|
26
27
|
|
27
28
|
|
data/spec/text_spec.rb
CHANGED
@@ -33,6 +33,13 @@ describe 'persian character methods' do
|
|
33
33
|
expect(Persian::Text.remove_signs(before)).to eq(after)
|
34
34
|
end
|
35
35
|
|
36
|
+
it 'should replace all zwnjs with space ' do
|
37
|
+
before = 'مندرنیمفاصلهاتاسیرم'
|
38
|
+
after = 'من در نیم فاصله ات اسیرم'
|
39
|
+
|
40
|
+
expect(Persian::Text.replace_zwnj_with_space(before)).to eq(after)
|
41
|
+
end
|
42
|
+
|
36
43
|
it 'should replace [ & ], { & }, ( & ), " & " with « & »' do
|
37
44
|
before_first = 'اگر اراده ای نباشد عشقی نیست. "گاندی"'
|
38
45
|
after_first = 'اگر اراده ای نباشد عشقی نیست. «گاندی»'
|
@@ -233,4 +240,19 @@ describe 'persian character methods' do
|
|
233
240
|
after = 'سلام. اسپیس کو؟'
|
234
241
|
expect(Persian::Text.space_after_dot(text)).to eq(after)
|
235
242
|
end
|
243
|
+
|
244
|
+
it 'should remove all repeated characters from text' do
|
245
|
+
text = 'سلااااااام.چی میکنییی؟؟؟؟؟'
|
246
|
+
after = 'سلام.چی میکنی؟'
|
247
|
+
|
248
|
+
expect(Persian::Text.squeeze(text)).to eq(after)
|
249
|
+
end
|
250
|
+
|
251
|
+
it 'should remove text postfix' do
|
252
|
+
text = 'پسره'
|
253
|
+
postfix = 'ه'
|
254
|
+
result = 'پسر'
|
255
|
+
|
256
|
+
expect(Persian::Text.remove_postfix(text, postfix)).to eq(result)
|
257
|
+
end
|
236
258
|
end
|
data/spec/tokenizer_spec.rb
CHANGED
@@ -10,6 +10,14 @@ describe 'persian tokenizers' do
|
|
10
10
|
expect(Persian::Tokenizer.tokenize(before)).to eq(after)
|
11
11
|
end
|
12
12
|
|
13
|
+
it 'should tokenize as a serie' do
|
14
|
+
text = 'سلام من به تو یار قدیمی'
|
15
|
+
parts = 3
|
16
|
+
result = ['سلام من به', 'من به تو', 'به تو یار', 'تو یار قدیمی']
|
17
|
+
|
18
|
+
expect(Persian::Tokenizer.tokenize_more(text, parts)).to eq(result)
|
19
|
+
end
|
20
|
+
|
13
21
|
it 'should split paragraphs' do
|
14
22
|
text = "
|
15
23
|
یوهانس برامس در سال ۱۸۳۳ در شهر هامبورگ آلمان در خانوادهای فقیر به دنیا آمد. تحصیلات ابتدایی موسیقی را نزد پدرش که نوازنده کنترباس بود فرا گرفت.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: persian
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dariush Abbasi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-03-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -43,6 +43,7 @@ files:
|
|
43
43
|
- lib/persian/dynamic.rb
|
44
44
|
- lib/persian/list/alphabet.rb
|
45
45
|
- lib/persian/list/character.rb
|
46
|
+
- lib/persian/list/homonyms.rb
|
46
47
|
- lib/persian/list/number.rb
|
47
48
|
- lib/persian/num_text.rb
|
48
49
|
- lib/persian/number.rb
|
@@ -63,7 +64,7 @@ files:
|
|
63
64
|
- spec/tokenizer_spec.rb
|
64
65
|
- spec/unicode_spec.rb
|
65
66
|
- spec/url_spec.rb
|
66
|
-
homepage: http://github.com/
|
67
|
+
homepage: http://github.com/dariubs/persian.rb
|
67
68
|
licenses:
|
68
69
|
- MIT
|
69
70
|
metadata: {}
|
@@ -82,8 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
83
|
- !ruby/object:Gem::Version
|
83
84
|
version: '0'
|
84
85
|
requirements: []
|
85
|
-
|
86
|
-
rubygems_version: 2.5.1
|
86
|
+
rubygems_version: 3.2.5
|
87
87
|
signing_key:
|
88
88
|
specification_version: 4
|
89
89
|
summary: Persian language for ruby.
|