persian 0.1.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.editorconfig +9 -0
- data/.gitignore +51 -0
- data/.rspec +3 -0
- data/.rubocop.yml +29 -0
- data/.travis.yml +8 -0
- data/Gemfile +10 -0
- data/Rakefile +36 -0
- data/lib/persian.rb +15 -9
- data/lib/persian/counter.rb +61 -0
- data/lib/persian/date.rb +150 -0
- data/lib/persian/dynamic.rb +38 -0
- data/lib/persian/list/alphabet.rb +107 -0
- data/lib/persian/list/character.rb +193 -0
- data/lib/persian/list/number.rb +154 -149
- data/lib/persian/num_text.rb +53 -0
- data/lib/persian/number.rb +69 -20
- data/lib/persian/text/keyboard.rb +22 -0
- data/lib/persian/text/text.rb +197 -0
- data/lib/persian/tokenizer.rb +42 -0
- data/lib/persian/unicode.rb +42 -0
- data/lib/persian/url.rb +25 -0
- data/lib/persian/version.rb +2 -1
- data/persian.gemspec +26 -0
- data/readme.md +47 -0
- data/spec/counter_spec.rb +83 -0
- data/spec/dynamic_spec.rb +6 -0
- data/spec/num_text_spec.rb +17 -0
- data/spec/number_spec.rb +129 -0
- data/spec/spec_helper.rb +7 -0
- data/spec/text_spec.rb +236 -0
- data/spec/tokenizer_spec.rb +23 -0
- data/spec/unicode_spec.rb +25 -0
- data/spec/url_spec.rb +11 -0
- metadata +38 -12
- data/lib/persian/character.rb +0 -26
- data/lib/persian/num_to_char.rb +0 -60
@@ -0,0 +1,23 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe 'persian tokenizers' do
|
6
|
+
it 'should return list of words and special persian characters ' do
|
7
|
+
before = 'آیا روزی به اسرار این اتفاقات ماوراء طبیعی، این انعکاس سایهٔ روح که در حالت اغماء و برزخ بین خواب و بیداری جلوه میکند کسی پی خواهد برد؟'
|
8
|
+
after = ['آیا', 'روزی', 'به', 'اسرار', 'این', 'اتفاقات', 'ماوراء', 'طبیعی', '،', 'این', 'انعکاس', 'سایهٔ', 'روح', 'که', 'در', 'حالت', 'اغماء', 'و', 'برزخ', 'بین', 'خواب', 'و', 'بیداری', 'جلوه', 'میکند', 'کسی', 'پی', 'خواهد', 'برد', '؟']
|
9
|
+
|
10
|
+
expect(Persian::Tokenizer.tokenize(before)).to eq(after)
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'should split paragraphs' do
|
14
|
+
text = "
|
15
|
+
یوهانس برامس در سال ۱۸۳۳ در شهر هامبورگ آلمان در خانوادهای فقیر به دنیا آمد. تحصیلات ابتدایی موسیقی را نزد پدرش که نوازنده کنترباس بود فرا گرفت.
|
16
|
+
برامس با ویولونیستهای مشهوری چون رمنی و یواخیم آشنا شد و در طول این آشنایی بود که رمنی موسیقی محلی مجارستان را به برامس معرفی کرد و تحت تأثیر آن برامس رقصهای مجار خود را نوشت.
|
17
|
+
"
|
18
|
+
result = ['یوهانس برامس در سال ۱۸۳۳ در شهر هامبورگ آلمان در خانوادهای فقیر به دنیا آمد. تحصیلات ابتدایی موسیقی را نزد پدرش که نوازنده کنترباس بود فرا گرفت.',
|
19
|
+
'برامس با ویولونیستهای مشهوری چون رمنی و یواخیم آشنا شد و در طول این آشنایی بود که رمنی موسیقی محلی مجارستان را به برامس معرفی کرد و تحت تأثیر آن برامس رقصهای مجار خود را نوشت.']
|
20
|
+
|
21
|
+
expect(Persian::Tokenizer.split_paragraphs(text)).to eq(result)
|
22
|
+
end
|
23
|
+
end
|
@@ -0,0 +1,25 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe 'persian character methods' do
|
6
|
+
it 'should return unicode character from codepoint hex' do
|
7
|
+
codepoint = 0x062B
|
8
|
+
char = 'ث'
|
9
|
+
|
10
|
+
expect(Persian::Unicode.codepoint_to_char(codepoint)).to eq(char)
|
11
|
+
end
|
12
|
+
|
13
|
+
it 'should return text between RIGHT-TO-LEFT EMBEDDING(U+202B) and Pop Directional Format(U+202C)' do
|
14
|
+
before = 'مست+'
|
15
|
+
after = 'مست+'
|
16
|
+
expect(Persian::Unicode.rle(before)).to eq(after)
|
17
|
+
end
|
18
|
+
|
19
|
+
it 'should return text between LEFT-TO-RIGHT EMBEDDING(U+202A) and Pop Directional Format(U+202C)' do
|
20
|
+
before = 'من c++ بلدم'
|
21
|
+
after = 'من c++ بلدم'
|
22
|
+
|
23
|
+
expect(Persian::Unicode.lre(before)).to eq(after)
|
24
|
+
end
|
25
|
+
end
|
data/spec/url_spec.rb
ADDED
@@ -0,0 +1,11 @@
|
|
1
|
+
# -*- coding: UTF-8 -*-
|
2
|
+
|
3
|
+
require 'spec_helper'
|
4
|
+
|
5
|
+
describe 'persian url normalizer methods' do
|
6
|
+
it 'should normalize persian text for use in url' do
|
7
|
+
text = '«دونالد ترامپ» پیروز انتخابات ایالات متحده شد'
|
8
|
+
normal = 'دونالد-ترامپ-پیروز-انتخابات-ایالات-متحده-شد'
|
9
|
+
expect(Persian::Url.urlify(text)).to eq(normal)
|
10
|
+
end
|
11
|
+
end
|
metadata
CHANGED
@@ -1,42 +1,69 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: persian
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Dariush Abbasi
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-
|
11
|
+
date: 2016-11-16 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rspec
|
15
15
|
requirement: !ruby/object:Gem::Requirement
|
16
16
|
requirements:
|
17
|
-
- -
|
17
|
+
- - '='
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version: 3.4
|
19
|
+
version: '3.4'
|
20
20
|
type: :development
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
|
-
- -
|
24
|
+
- - '='
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version: 3.4
|
26
|
+
version: '3.4'
|
27
27
|
description: A set of utilities for Persian language.
|
28
28
|
email: poshtehani@gmail.com
|
29
29
|
executables: []
|
30
30
|
extensions: []
|
31
31
|
extra_rdoc_files: []
|
32
32
|
files:
|
33
|
+
- ".editorconfig"
|
34
|
+
- ".gitignore"
|
35
|
+
- ".rspec"
|
36
|
+
- ".rubocop.yml"
|
37
|
+
- ".travis.yml"
|
38
|
+
- Gemfile
|
39
|
+
- Rakefile
|
33
40
|
- lib/persian.rb
|
34
|
-
- lib/persian/
|
41
|
+
- lib/persian/counter.rb
|
42
|
+
- lib/persian/date.rb
|
43
|
+
- lib/persian/dynamic.rb
|
44
|
+
- lib/persian/list/alphabet.rb
|
45
|
+
- lib/persian/list/character.rb
|
35
46
|
- lib/persian/list/number.rb
|
36
|
-
- lib/persian/
|
47
|
+
- lib/persian/num_text.rb
|
37
48
|
- lib/persian/number.rb
|
49
|
+
- lib/persian/text/keyboard.rb
|
50
|
+
- lib/persian/text/text.rb
|
51
|
+
- lib/persian/tokenizer.rb
|
52
|
+
- lib/persian/unicode.rb
|
53
|
+
- lib/persian/url.rb
|
38
54
|
- lib/persian/version.rb
|
39
|
-
|
55
|
+
- persian.gemspec
|
56
|
+
- readme.md
|
57
|
+
- spec/counter_spec.rb
|
58
|
+
- spec/dynamic_spec.rb
|
59
|
+
- spec/num_text_spec.rb
|
60
|
+
- spec/number_spec.rb
|
61
|
+
- spec/spec_helper.rb
|
62
|
+
- spec/text_spec.rb
|
63
|
+
- spec/tokenizer_spec.rb
|
64
|
+
- spec/unicode_spec.rb
|
65
|
+
- spec/url_spec.rb
|
66
|
+
homepage: http://github.com/negah/persian
|
40
67
|
licenses:
|
41
68
|
- MIT
|
42
69
|
metadata: {}
|
@@ -56,9 +83,8 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
56
83
|
version: '0'
|
57
84
|
requirements: []
|
58
85
|
rubyforge_project:
|
59
|
-
rubygems_version: 2.
|
86
|
+
rubygems_version: 2.5.1
|
60
87
|
signing_key:
|
61
88
|
specification_version: 4
|
62
|
-
summary:
|
89
|
+
summary: Persian language for ruby.
|
63
90
|
test_files: []
|
64
|
-
has_rdoc:
|
data/lib/persian/character.rb
DELETED
@@ -1,26 +0,0 @@
|
|
1
|
-
# -*- coding: UTF-8 -*-
|
2
|
-
|
3
|
-
class Persian
  # Substitution table used by Persian.character: every occurrence of a key
  # is replaced by its value. Built once and frozen instead of being
  # re-allocated on each call.
  CHARACTER_REPLACEMENTS = {
    "ك" => "ک",
    "دِ" => "د",
    "بِ" => "ب",
    "زِ" => "ز",
    "ذِ" => "ذ",
    "شِ" => "ش",
    "سِ" => "س",
    "ى" => "ی",
    "ي" => "ی",
    "ة" => "ه",
    "هٔ" => "ه"
  }.freeze

  # Applies every substitution in CHARACTER_REPLACEMENTS to the given text.
  #
  # A mutable string is rewritten in place and returned (the long-standing
  # behaviour). A frozen string cannot be rewritten in place, so a modified
  # copy is returned instead of raising.
  #
  # @param char [String] text to normalise
  # @return [String] the normalised text
  def self.character(char)
    text = char.frozen? ? char.dup : char
    CHARACTER_REPLACEMENTS.each { |from, to| text.gsub!(from, to) }
    text
  end
end
|
data/lib/persian/num_to_char.rb
DELETED
@@ -1,60 +0,0 @@
|
|
1
|
-
# -*- coding: UTF-8 -*-
|
2
|
-
|
3
|
-
class Persian
  # Spells a number out in words using the class-level word lists
  # (@ones, @teens, @decimal, @hundreds, @longscale), which are defined
  # outside this file.
  #
  # Numbers below 1000 are spelled directly. Larger numbers are split into
  # their leading group of one to three digits, followed by the matching
  # @longscale word and the spelled-out remainder.
  #
  # NOTE(review): negative input is not handled here; behaviour depends on
  # Persian.number and should be confirmed against callers.
  #
  # @param num [Integer, String] the number; a String is first converted
  #   with Persian.number(num, lang: "en", return: "int")
  # @param inner [Boolean] true when spelling the remainder of a larger
  #   number: a zero yields "" and anything else is prefixed with " و "
  # @return [String] the spelled-out number
  def self.num_to_char(num, inner = false)
    num = Persian.number(num, { lang: "en", return: "int" }) if num.is_a?(String)

    # A zero remainder adds nothing to the enclosing number.
    return "" if inner && num == 0

    # Every non-zero remainder is joined to what precedes it. The 10..19
    # case used to skip this joiner, gluing e.g. 115 together.
    joiner = inner ? " و " : ""

    spelled =
      if (0...10).cover?(num / 100) # i.e. 0 <= num < 1000
        if num < 10
          @ones[num]
        elsif num < 20
          @teens[num - 10]
        elsif num < 100
          @decimal[num / 10] + num_to_char(num % 10, true)
        else
          @hundreds[num / 100] + num_to_char(num % 100, true)
        end
      else
        digits = num.to_s
        # Leading group: 1 or 2 digits when the length is not a multiple
        # of three, otherwise a full group of 3.
        head_length = digits.length % 3
        head_length = 3 if head_length.zero?
        head = digits[0, head_length]
        rest = digits[head_length..-1]
        # 4-6 digits -> index 1, 7-9 digits -> index 2, and so on.
        scale = @longscale[(digits.length - 1) / 3]
        num_to_char(head) + " " + scale + num_to_char(rest, true)
      end

    joiner + spelled
  end
end
|