virastar 0.0.2 → 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/Gemfile.lock +1 -1
- data/TODO +12 -0
- data/lib/virastar/version.rb +1 -1
- data/lib/virastar.rb +5 -5
- data/spec/virastar_spec.rb +12 -4
- metadata +5 -4
data/Gemfile.lock
CHANGED
data/TODO
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
- do not put a space after : in numeric contexts such as times, e.g. 19:45 => ۱۹:۴۵
|
2
|
+
- spacing after , : ; causes a lot of problems in cases like this: (,) => (, )
|
3
|
+
- do not destroy dots and colons in URLs
|
4
|
+
|
5
|
+
- translate to js
|
6
|
+
|
7
|
+
|
8
|
+
DONE:
|
9
|
+
- he yeh => hamzeh should consider arabic yeh and zwnj chars
|
10
|
+
- replacing quotes shouldn't be greedy
|
11
|
+
- for suffixes also consider tari
|
12
|
+
- (IMP) do not replace \n (probably caused by fixing spaces after dots and commas)
|
data/lib/virastar/version.rb
CHANGED
data/lib/virastar.rb
CHANGED
@@ -39,10 +39,10 @@ module Virastar
|
|
39
39
|
text.gsub!(/\s*\.{3,}/,'…') if @fix_three_dots
|
40
40
|
|
41
41
|
# replace English quotes with their Persian equivalent
|
42
|
-
text.gsub!(/(["'`]+)(
|
42
|
+
text.gsub!(/(["'`]+)(.+?)(\1)/, '«\2»') if @fix_english_quotes
|
43
43
|
|
44
44
|
# should convert ه ی to ه
|
45
|
-
text.gsub!(/(\S)(ه[\s]
|
45
|
+
text.gsub!(/(\S)(ه[\s]+[یي])(\s)/, '\1هٔ\3') if @fix_hamzeh
|
46
46
|
|
47
47
|
# remove unnecessary zwnj char that are succeeded/preceded by a space
|
48
48
|
text.gsub!(/\s+|\s+/,' ') if @cleanup_zwnj
|
@@ -80,7 +80,7 @@ module Virastar
|
|
80
80
|
# put zwnj between word and suffix (*tar *tarin *ha *haye)
|
81
81
|
# there's a possible bug here: های and تر could be separate nouns and not suffix
|
82
82
|
if @fix_suffix_spacing
|
83
|
-
text.gsub!(/\s+(تر(
|
83
|
+
text.gsub!(/\s+(تر(ی(ن)?)?|ها(ی)?)\s+/,'\1 ') # in case you can not read it: \s+(tar(i(n)?)?|ha(ye)?)\s+
|
84
84
|
end
|
85
85
|
|
86
86
|
# -- Aggressive Editing ------------------------------------------
|
@@ -100,13 +100,13 @@ module Virastar
|
|
100
100
|
|
101
101
|
# : ; , . ! ? and their persian equivalents should have one space after and no space before
|
102
102
|
if @fix_spacing_for_braces_and_quotes
|
103
|
-
text.gsub!(
|
103
|
+
text.gsub!(/[ ]*([:;,؛،.؟!]{1})[ ]*/, '\1 ')
|
104
104
|
end
|
105
105
|
|
106
106
|
# should replace more than one space with just a single one
|
107
107
|
if @cleanup_spacing
|
108
108
|
text.gsub!(/[ ]+/,' ')
|
109
|
-
text.gsub!(/\s*[\n]+\s*/," \n")
|
109
|
+
#text.gsub!(/\s*[\n]+\s*/," \n")
|
110
110
|
end
|
111
111
|
|
112
112
|
# remove spaces, tabs, and new lines from the beginning and end of file
|
data/spec/virastar_spec.rb
CHANGED
@@ -47,8 +47,10 @@ describe Virastar do
|
|
47
47
|
it "should correct :;,.?! spacing (one space after and no space before)" do
|
48
48
|
test = "گفت : سلام"
|
49
49
|
result = "گفت: سلام"
|
50
|
-
|
50
|
+
test2 = "salam.\n\nkhoobi"
|
51
|
+
result2 = "salam. \n\nkhoobi"
|
51
52
|
test.persian_cleanup.should == result
|
53
|
+
test2.persian_cleanup.should == result2
|
52
54
|
end
|
53
55
|
|
54
56
|
it "should replace English quotes with their Persian equivalent" do
|
@@ -59,11 +61,15 @@ describe Virastar do
|
|
59
61
|
test5 = "``تست``"
|
60
62
|
result = result2 = result4 = result5 = "«تست»"
|
61
63
|
result3 = "«گفت: سلام»"
|
64
|
+
# not greedy
|
65
|
+
test6 = '"this" or "that"'
|
66
|
+
result6 = '«this» or «that»'
|
62
67
|
test.persian_cleanup.should == result
|
63
68
|
test2.persian_cleanup.should == result2
|
64
69
|
test3.persian_cleanup.should == result3
|
65
70
|
test4.persian_cleanup.should == result4
|
66
71
|
test5.persian_cleanup.should == result5
|
72
|
+
test6.persian_cleanup.should == result6
|
67
73
|
end
|
68
74
|
|
69
75
|
it "should replace three dots with ellipsis" do
|
@@ -84,9 +90,11 @@ describe Virastar do
|
|
84
90
|
it "should convert ه ی to هٔ" do
|
85
91
|
test = "خانه ی ما"
|
86
92
|
test2 = "خانه ی ما"
|
87
|
-
|
93
|
+
test3 = "خانه ي ما"
|
94
|
+
result = result2 = result3 = "خانهٔ ما"
|
88
95
|
test.persian_cleanup.should == result
|
89
96
|
test2.persian_cleanup.should == result2
|
97
|
+
test3.persian_cleanup.should == result3
|
90
98
|
end
|
91
99
|
|
92
100
|
it "should replace double dash to ndash and triple dash to mdash" do
|
@@ -162,8 +170,8 @@ describe Virastar do
|
|
162
170
|
result2 = "ما نمیتوانیم"
|
163
171
|
test3 = "این بهترین کتاب ها است"
|
164
172
|
result3 = "این بهترین کتابها است"
|
165
|
-
test4 = "بزرگ
|
166
|
-
result4 = "
|
173
|
+
test4 = "بزرگ تری و قدرتمند ترین زبان های دنیا"
|
174
|
+
result4 = "بزرگتری و قدرتمندترین زبانهای دنیا"
|
167
175
|
test.persian_cleanup.should == result
|
168
176
|
end
|
169
177
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: virastar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 25
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 3
|
10
|
+
version: 0.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Allen A. Bargi
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-20 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|
@@ -67,6 +67,7 @@ files:
|
|
67
67
|
- LICENSE
|
68
68
|
- README.md
|
69
69
|
- Rakefile
|
70
|
+
- TODO
|
70
71
|
- lib/virastar.rb
|
71
72
|
- lib/virastar/version.rb
|
72
73
|
- spec/spec_helper.rb
|