persian 0.2.1 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +5 -5
- data/.rubocop.yml +3 -0
- data/lib/persian/list/homonyms.rb +59 -0
- data/lib/persian/text/text.rb +24 -7
- data/lib/persian/tokenizer.rb +14 -0
- data/lib/persian/url.rb +2 -2
- data/lib/persian/version.rb +1 -1
- data/lib/persian.rb +1 -0
- data/persian.gemspec +2 -2
- data/readme.md +22 -21
- data/spec/text_spec.rb +22 -0
- data/spec/tokenizer_spec.rb +8 -0
- metadata +5 -5
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
|
-
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: ad07e72f8e952adecef3078d3132de48372e936682eaf67dfc77745d863d24d5
|
4
|
+
data.tar.gz: 618f5b540864a034b4fc5bae2865a1c6de8f0fff50fb12eb1c28e4a506062d06
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 5dd769632abf8da06746802aedbcfc83a3bb49edfec68364f068c476454469b38ba8e91a0ea721026202d025714a88ee4a969d76ea928f88b0a48af134cb6f71
|
7
|
+
data.tar.gz: ad6bea9eee317516acfe91241137896f474877531ef34c4b69fe1e297b7ca346757e88b9baf7e67dc16b01b9ddc86994900ae1b442ac2325cf59cc909c955c9b
|
data/.rubocop.yml
CHANGED
data/lib/persian/list/homonyms.rb
CHANGED
@@ -0,0 +1,59 @@
|
|
1
|
+
module Persian
  # Homonym letter groups of Persian
  # NOTE(review): the letter constants (TE, TA, SIN, ...) are presumably
  # supplied by the included Alphabet module - confirm against alphabet.rb
  module Homonyms
    include Alphabet

    T = [TE, TA].freeze
    S = [THE, SIN, SAD].freeze
    H = [HE_JIMI, HE_DOCHESHM].freeze
    Z = [ZAL, ZE, ZA, ZAD].freeze
    GH = [GHEIN, QAF].freeze

    # Every homonym group, keyed by the symbol naming the group
    ALL = { T: T, S: S, H: H, Z: Z, GH: GH }.freeze

    # Every homonym letter in one flat array, in group order
    ALL_a = ALL.values.flatten.freeze

    # Reverse lookup: letter (as a String) => symbol of the group it is in
    ALL_r = ALL.each_with_object({}) do |(group, letters), lookup|
      letters.each { |letter| lookup[letter.to_s] = group }
    end.freeze
  end
end
|
data/lib/persian/text/text.rb
CHANGED
@@ -20,29 +20,35 @@ module Persian
|
|
20
20
|
|
21
21
|
# Strip every Arabic harekat listed in HAREKATS out of the text
def self.remove_harekats(text)
  HAREKATS.inject(text) { |cleaned, harekat| cleaned.gsub(harekat, '') }
end
|
26
26
|
|
27
27
|
# Delete every bracket listed in BRACKETS from the text
def self.remove_brackets(text)
  BRACKETS.inject(text) { |cleaned, bracket| cleaned.gsub(bracket, '') }
end
|
32
32
|
|
33
33
|
# Swap every Persian sign listed in SIGNS for +with+ (removed by default)
# A nil text yields an empty string
def self.remove_signs(text, with = '')
  return '' if text.nil?

  SIGNS.inject(text) { |cleaned, sign| cleaned.gsub(sign, with) }
end
|
39
|
+
|
40
|
+
# Replace every zero-width non-joiner (U+200C) in the text with a space
#
# The character is written as an escape on purpose: a literal ZWNJ is
# invisible and easily lost, which leaves an empty pattern that matches at
# every position of the text.
def self.replace_zwnj_with_space(text)
  text.gsub(/\u200C/, ' ')
end
|
38
44
|
|
39
45
|
# Replace general brackets ("...", [...], {...}, (...)) with one bracket type
# Default: 0xAB & 0xBB
def self.general_brackets(text, left = '«', right = '»')
  patterns = [/"(.*?)"/, /\[(.*?)\]/, /\{(.*?)\}/, /\((.*?)\)/]

  # The block form inserts left/right verbatim, so a backslash inside them
  # is never read by gsub as an escape or back-reference
  patterns.inject(text) do |result, pattern|
    result.gsub(pattern) { left + Regexp.last_match(1) + right }
  end
end
|
48
54
|
|
@@ -193,5 +199,16 @@ module Persian
|
|
193
199
|
text.gsub!(/(\.)(\S)/, '\1 \2')
|
194
200
|
text
|
195
201
|
end
|
202
|
+
|
203
|
+
# Collapse runs of a repeated character into a single occurrence
#
# text  - the String to clean
# chars - optional character selectors, passed straight to String#squeeze;
#         when given, only those characters are collapsed
#
# Example: squeeze('سلااام') # => 'سلام'
# Example: squeeze('100  aa', ' ') # => '100 aa'
def self.squeeze(text, *chars)
  text.squeeze(*chars)
end
|
206
|
+
|
207
|
+
# Remove specific character from end of text
# Example: remove_postfix('پسره','ه') # => 'پسر'
#
# Returns a new String; the text passed in is never modified, so frozen
# strings are accepted as well
def self.remove_postfix(text, postfix)
  text.chomp(postfix)
end
|
196
213
|
end
|
197
214
|
end
|
data/lib/persian/tokenizer.rb
CHANGED
@@ -32,6 +32,20 @@ module Persian
|
|
32
32
|
tokens.flatten
|
33
33
|
end
|
34
34
|
|
35
|
+
# Build every run of +num+ consecutive tokens from the text
#
# text - the String handed to tokenize
# num  - how many tokens each entry holds
#
# Returns an Array of Strings, each one run of tokens joined by a space;
# the Array is empty when num is below 1 or larger than the token count
def self.tokenize_more(text, num)
  # each_cons rejects sizes below 1, and no run can be that short anyway
  return [] if num < 1

  tokenize(text).each_cons(num).map { |run| run.join(' ').strip }
end
|
48
|
+
|
35
49
|
# Split paragraphs
|
36
50
|
# Return an array of paragraphs
|
37
51
|
def self.split_paragraphs(text)
|
data/lib/persian/url.rb
CHANGED
@@ -10,7 +10,7 @@ module Persian
|
|
10
10
|
# remove harekats
|
11
11
|
text = Text.remove_harekats(text)
|
12
12
|
# remove slash and backslash
|
13
|
-
text.gsub
|
13
|
+
text = text.gsub(%r{(\/||\\)}, '')
|
14
14
|
# remove signs
|
15
15
|
text = Text.remove_signs(text, ' ')
|
16
16
|
# Remove extra spaces
|
@@ -18,7 +18,7 @@ module Persian
|
|
18
18
|
# trim spaces from start and end of text
|
19
19
|
text = text.strip
|
20
20
|
# replace space with dash
|
21
|
-
text.gsub
|
21
|
+
text = text.gsub(/\s/, '-')
|
22
22
|
text
|
23
23
|
end
|
24
24
|
end
|
data/lib/persian/version.rb
CHANGED
data/lib/persian.rb
CHANGED
data/persian.gemspec
CHANGED
@@ -8,7 +8,7 @@ require 'persian/version'
|
|
8
8
|
Gem::Specification.new do |s|
|
9
9
|
s.name = 'persian'
|
10
10
|
s.version = Persian::VERSION
|
11
|
-
s.date = '
|
11
|
+
s.date = '2022-03-25'
|
12
12
|
s.summary = 'Persian language for ruby.'
|
13
13
|
s.description = 'A set of utilities for Persian language.'
|
14
14
|
s.authors = ['Dariush Abbasi']
|
@@ -19,7 +19,7 @@ Gem::Specification.new do |s|
|
|
19
19
|
`git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
|
20
20
|
s.require_paths = ['lib']
|
21
21
|
s.homepage =
|
22
|
-
'http://github.com/
|
22
|
+
'http://github.com/dariubs/persian.rb'
|
23
23
|
s.license = 'MIT'
|
24
24
|
|
25
25
|
s.add_development_dependency 'rspec', '3.4'
|
data/readme.md
CHANGED
@@ -1,27 +1,28 @@
|
|
1
1
|
<p align="center">
|
2
2
|
<img src="https://upload.wikimedia.org/wikipedia/commons/a/a2/Farsi.svg"
|
3
|
-
height="130" alt="Persian">
|
4
|
-
</p>
|
5
|
-
<p align="center">
|
6
|
-
<a href="https://travis-ci.org/negah/persian">
|
7
|
-
<img src="https://travis-ci.org/negah/persian.svg?branch=master"
|
8
|
-
alt="Build Status">
|
9
|
-
</a>
|
10
|
-
<a href="https://rubygems.org/gems/persian">
|
11
|
-
<img src="https://img.shields.io/badge/gem-persian-orange.svg"
|
12
|
-
alt="Ruby Gems">
|
13
|
-
</a>
|
14
|
-
|
15
|
-
<a href="https://rubygems.org/gems/persian">
|
16
|
-
<img src="https://img.shields.io/gem/dv/persian/stable.svg?maxAge=2592000"
|
17
|
-
alt="Ruby Gems downloads">
|
18
|
-
</a>
|
19
|
-
|
20
|
-
<a href="https://codeclimate.com/github/negah/persian">
|
21
|
-
<img src="https://codeclimate.com/github/negah/persian/badges/gpa.svg"
|
22
|
-
alt="Code Climate">
|
23
|
-
</a>
|
3
|
+
height="130" alt="Persian ruby gem">
|
24
4
|
</p>
|
5
|
+
|
6
|
+
<a href="https://travis-ci.org/negah/persian">
|
7
|
+
<img src="https://travis-ci.org/negah/persian.svg?branch=master"
|
8
|
+
alt="Build Status">
|
9
|
+
</a>
|
10
|
+
|
11
|
+
<a href="https://rubygems.org/gems/persian">
|
12
|
+
<img src="https://img.shields.io/badge/gem-persian-orange.svg"
|
13
|
+
alt="Ruby Gems">
|
14
|
+
</a>
|
15
|
+
|
16
|
+
<a href="https://rubygems.org/gems/persian">
|
17
|
+
<img src="https://img.shields.io/gem/dv/persian/stable.svg?maxAge=2592000"
|
18
|
+
alt="Ruby Gems downloads">
|
19
|
+
</a>
|
20
|
+
|
21
|
+
<a href="https://codeclimate.com/github/negah/persian">
|
22
|
+
<img src="https://codeclimate.com/github/negah/persian/badges/gpa.svg"
|
23
|
+
alt="Code Climate">
|
24
|
+
</a>
|
25
|
+
|
25
26
|
<p align="center"><sup><strong> Ruby gem for working with Persian text. </strong></sup></p>
|
26
27
|
|
27
28
|
|
data/spec/text_spec.rb
CHANGED
@@ -33,6 +33,13 @@ describe 'persian character methods' do
|
|
33
33
|
expect(Persian::Text.remove_signs(before)).to eq(after)
|
34
34
|
end
|
35
35
|
|
36
|
+
it 'should replace all zwnjs with space ' do
  # ZWNJ (U+200C) is invisible, so it is spelled out as an escape to keep
  # the fixture intact when the file is copied or re-rendered
  before = "من\u200Cدر\u200Cنیم\u200Cفاصله\u200Cات\u200Cاسیرم"
  after = 'من در نیم فاصله ات اسیرم'

  expect(Persian::Text.replace_zwnj_with_space(before)).to eq(after)
end
|
42
|
+
|
36
43
|
it 'should replace [ & ], { & }, ( & ), " & " with « & »' do
|
37
44
|
before_first = 'اگر اراده ای نباشد عشقی نیست. "گاندی"'
|
38
45
|
after_first = 'اگر اراده ای نباشد عشقی نیست. «گاندی»'
|
@@ -233,4 +240,19 @@ describe 'persian character methods' do
|
|
233
240
|
after = 'سلام. اسپیس کو؟'
|
234
241
|
expect(Persian::Text.space_after_dot(text)).to eq(after)
|
235
242
|
end
|
243
|
+
|
244
|
+
it 'should remove all repeated characters from text' do
|
245
|
+
text = 'سلااااااام.چی میکنییی؟؟؟؟؟'
|
246
|
+
after = 'سلام.چی میکنی؟'
|
247
|
+
|
248
|
+
expect(Persian::Text.squeeze(text)).to eq(after)
|
249
|
+
end
|
250
|
+
|
251
|
+
it 'should remove text postfix' do
|
252
|
+
text = 'پسره'
|
253
|
+
postfix = 'ه'
|
254
|
+
result = 'پسر'
|
255
|
+
|
256
|
+
expect(Persian::Text.remove_postfix(text, postfix)).to eq(result)
|
257
|
+
end
|
236
258
|
end
|
data/spec/tokenizer_spec.rb
CHANGED
@@ -10,6 +10,14 @@ describe 'persian tokenizers' do
|
|
10
10
|
expect(Persian::Tokenizer.tokenize(before)).to eq(after)
|
11
11
|
end
|
12
12
|
|
13
|
+
it 'should tokenize as a serie' do
|
14
|
+
text = 'سلام من به تو یار قدیمی'
|
15
|
+
parts = 3
|
16
|
+
result = ['سلام من به', 'من به تو', 'به تو یار', 'تو یار قدیمی']
|
17
|
+
|
18
|
+
expect(Persian::Tokenizer.tokenize_more(text, parts)).to eq(result)
|
19
|
+
end
|
20
|
+
|
13
21
|
it 'should split paragraphs' do
|
14
22
|
text = "
|
15
23
|
یوهانس برامس در سال ۱۸۳۳ در شهر هامبورگ آلمان در خانوادهای فقیر به دنیا آمد. تحصیلات ابتدایی موسیقی را نزد پدرش که نوازنده کنترباس بود فرا گرفت.
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: persian
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dariush Abbasi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2022-03-25 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
@@ -43,6 +43,7 @@ files:
|
|
43
43
|
- lib/persian/dynamic.rb
|
44
44
|
- lib/persian/list/alphabet.rb
|
45
45
|
- lib/persian/list/character.rb
|
46
|
+
- lib/persian/list/homonyms.rb
|
46
47
|
- lib/persian/list/number.rb
|
47
48
|
- lib/persian/num_text.rb
|
48
49
|
- lib/persian/number.rb
|
@@ -63,7 +64,7 @@ files:
|
|
63
64
|
- spec/tokenizer_spec.rb
|
64
65
|
- spec/unicode_spec.rb
|
65
66
|
- spec/url_spec.rb
|
66
|
-
homepage: http://github.com/
|
67
|
+
homepage: http://github.com/dariubs/persian.rb
|
67
68
|
licenses:
|
68
69
|
- MIT
|
69
70
|
metadata: {}
|
@@ -82,8 +83,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
82
83
|
- !ruby/object:Gem::Version
|
83
84
|
version: '0'
|
84
85
|
requirements: []
|
85
|
-
|
86
|
-
rubygems_version: 2.5.1
|
86
|
+
rubygems_version: 3.2.5
|
87
87
|
signing_key:
|
88
88
|
specification_version: 4
|
89
89
|
summary: Persian language for ruby.
|