virastar 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/Gemfile.lock +1 -1
- data/TODO +12 -0
- data/lib/virastar/version.rb +1 -1
- data/lib/virastar.rb +5 -5
- data/spec/virastar_spec.rb +12 -4
- metadata +5 -4
data/Gemfile.lock
CHANGED
data/TODO
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
- do not put a space after : inside numbers such as times, e.g. 19:45 => ۱۹:۴۵
|
|
2
|
+
- spacing after , : ; causes a lot of problems in cases like this: (,) => (, )
|
|
3
|
+
- do not destroy urls dots and colons
|
|
4
|
+
|
|
5
|
+
- translate to js
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
DONE:
|
|
9
|
+
- he yeh => hamzeh should consider arabic yeh and zwnj chars
|
|
10
|
+
- replacing quotes shouldn't be greedy
|
|
11
|
+
- for suffixes also consider tari
|
|
12
|
+
- (IMP) do not replace \n probably caused by fixing spaces after dots and commas
|
data/lib/virastar/version.rb
CHANGED
data/lib/virastar.rb
CHANGED
|
@@ -39,10 +39,10 @@ module Virastar
|
|
|
39
39
|
text.gsub!(/\s*\.{3,}/,'…') if @fix_three_dots
|
|
40
40
|
|
|
41
41
|
# replace English quotes with their Persian equivalent
|
|
42
|
-
text.gsub!(/(["'`]+)(
|
|
42
|
+
text.gsub!(/(["'`]+)(.+?)(\1)/, '«\2»') if @fix_english_quotes
|
|
43
43
|
|
|
44
44
|
# should convert ه ی to ه
|
|
45
|
-
text.gsub!(/(\S)(ه[\s]
|
|
45
|
+
text.gsub!(/(\S)(ه[\s]+[یي])(\s)/, '\1هٔ\3') if @fix_hamzeh
|
|
46
46
|
|
|
47
47
|
# remove unnecessary zwnj chars that are succeeded/preceded by a space
|
|
48
48
|
text.gsub!(/\s+|\s+/,' ') if @cleanup_zwnj
|
|
@@ -80,7 +80,7 @@ module Virastar
|
|
|
80
80
|
# put zwnj between word and suffix (*tar *tarin *ha *haye)
|
|
81
81
|
# there's a possible bug here: های and تر could be separate nouns and not suffix
|
|
82
82
|
if @fix_suffix_spacing
|
|
83
|
-
text.gsub!(/\s+(تر(
|
|
83
|
+
text.gsub!(/\s+(تر(ی(ن)?)?|ها(ی)?)\s+/,'\1 ') # in case you can not read it: \s+(tar(i(n)?)?|ha(ye)?)\s+
|
|
84
84
|
end
|
|
85
85
|
|
|
86
86
|
# -- Aggressive Editing ------------------------------------------
|
|
@@ -100,13 +100,13 @@ module Virastar
|
|
|
100
100
|
|
|
101
101
|
# : ; , . ! ? and their persian equivalents should have one space after and no space before
|
|
102
102
|
if @fix_spacing_for_braces_and_quotes
|
|
103
|
-
text.gsub!(
|
|
103
|
+
text.gsub!(/[ ]*([:;,؛،.؟!]{1})[ ]*/, '\1 ')
|
|
104
104
|
end
|
|
105
105
|
|
|
106
106
|
# should replace more than one space with just a single one
|
|
107
107
|
if @cleanup_spacing
|
|
108
108
|
text.gsub!(/[ ]+/,' ')
|
|
109
|
-
text.gsub!(/\s*[\n]+\s*/," \n")
|
|
109
|
+
#text.gsub!(/\s*[\n]+\s*/," \n")
|
|
110
110
|
end
|
|
111
111
|
|
|
112
112
|
# remove spaces, tabs, and new lines from the beginning and end of file
|
data/spec/virastar_spec.rb
CHANGED
|
@@ -47,8 +47,10 @@ describe Virastar do
|
|
|
47
47
|
it "should correct :;,.?! spacing (one space after and no space before)" do
|
|
48
48
|
test = "گفت : سلام"
|
|
49
49
|
result = "گفت: سلام"
|
|
50
|
-
|
|
50
|
+
test2 = "salam.\n\nkhoobi"
|
|
51
|
+
result2 = "salam. \n\nkhoobi"
|
|
51
52
|
test.persian_cleanup.should == result
|
|
53
|
+
test2.persian_cleanup.should == result2
|
|
52
54
|
end
|
|
53
55
|
|
|
54
56
|
it "should replace English quotes with their Persian equivalent" do
|
|
@@ -59,11 +61,15 @@ describe Virastar do
|
|
|
59
61
|
test5 = "``تست``"
|
|
60
62
|
result = result2 = result4 = result5 = "«تست»"
|
|
61
63
|
result3 = "«گفت: سلام»"
|
|
64
|
+
# not greedy
|
|
65
|
+
test6 = '"this" or "that"'
|
|
66
|
+
result6 = '«this» or «that»'
|
|
62
67
|
test.persian_cleanup.should == result
|
|
63
68
|
test2.persian_cleanup.should == result2
|
|
64
69
|
test3.persian_cleanup.should == result3
|
|
65
70
|
test4.persian_cleanup.should == result4
|
|
66
71
|
test5.persian_cleanup.should == result5
|
|
72
|
+
test6.persian_cleanup.should == result6
|
|
67
73
|
end
|
|
68
74
|
|
|
69
75
|
it "should replace three dots with ellipsis" do
|
|
@@ -84,9 +90,11 @@ describe Virastar do
|
|
|
84
90
|
it "should convert ه ی to هٔ" do
|
|
85
91
|
test = "خانه ی ما"
|
|
86
92
|
test2 = "خانه ی ما"
|
|
87
|
-
|
|
93
|
+
test3 = "خانه ي ما"
|
|
94
|
+
result = result2 = result3 = "خانهٔ ما"
|
|
88
95
|
test.persian_cleanup.should == result
|
|
89
96
|
test2.persian_cleanup.should == result2
|
|
97
|
+
test3.persian_cleanup.should == result3
|
|
90
98
|
end
|
|
91
99
|
|
|
92
100
|
it "should replace double dash to ndash and triple dash to mdash" do
|
|
@@ -162,8 +170,8 @@ describe Virastar do
|
|
|
162
170
|
result2 = "ما نمیتوانیم"
|
|
163
171
|
test3 = "این بهترین کتاب ها است"
|
|
164
172
|
result3 = "این بهترین کتابها است"
|
|
165
|
-
test4 = "بزرگ
|
|
166
|
-
result4 = "
|
|
173
|
+
test4 = "بزرگ تری و قدرتمند ترین زبان های دنیا"
|
|
174
|
+
result4 = "بزرگتری و قدرتمندترین زبانهای دنیا"
|
|
167
175
|
test.persian_cleanup.should == result
|
|
168
176
|
end
|
|
169
177
|
|
metadata
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: virastar
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
hash:
|
|
4
|
+
hash: 25
|
|
5
5
|
prerelease:
|
|
6
6
|
segments:
|
|
7
7
|
- 0
|
|
8
8
|
- 0
|
|
9
|
-
-
|
|
10
|
-
version: 0.0.
|
|
9
|
+
- 3
|
|
10
|
+
version: 0.0.3
|
|
11
11
|
platform: ruby
|
|
12
12
|
authors:
|
|
13
13
|
- Allen A. Bargi
|
|
@@ -15,7 +15,7 @@ autorequire:
|
|
|
15
15
|
bindir: bin
|
|
16
16
|
cert_chain: []
|
|
17
17
|
|
|
18
|
-
date: 2011-01-
|
|
18
|
+
date: 2011-01-20 00:00:00 +01:00
|
|
19
19
|
default_executable:
|
|
20
20
|
dependencies:
|
|
21
21
|
- !ruby/object:Gem::Dependency
|
|
@@ -67,6 +67,7 @@ files:
|
|
|
67
67
|
- LICENSE
|
|
68
68
|
- README.md
|
|
69
69
|
- Rakefile
|
|
70
|
+
- TODO
|
|
70
71
|
- lib/virastar.rb
|
|
71
72
|
- lib/virastar/version.rb
|
|
72
73
|
- spec/spec_helper.rb
|