virastar 0.0.4 → 0.0.5
Sign up to get free protection for your applications and to get access to all the features.
- data/TODO +1 -10
- data/lib/virastar/version.rb +1 -1
- data/lib/virastar.rb +32 -10
- data/spec/virastar_spec.rb +16 -8
- metadata +4 -4
data/TODO
CHANGED
@@ -1,10 +1 @@
|
|
1
|
-
-
|
2
|
-
- do not destroy urls dots and colons
|
3
|
-
/https?://([-\w\.]+)+(:\d+)?(/([\w/_\.]*(\?\S+)?)?)?/
|
4
|
-
|
5
|
-
- translate to js
|
6
|
-
|
7
|
-
|
8
|
-
DONE:
|
9
|
-
- spacing after , : ; causes a lot of problems in this case (,) => (, )
|
10
|
-
- do not put space after : in the context of numbers like hour 19:45 => ۱۹:۴۵
|
1
|
+
- translate to js
|
data/lib/virastar/version.rb
CHANGED
data/lib/virastar.rb
CHANGED
@@ -29,6 +29,16 @@ module Virastar
|
|
29
29
|
|
30
30
|
def cleanup
|
31
31
|
text = @text
|
32
|
+
|
33
|
+
# removing URLs and bringing them back at the end of the process
|
34
|
+
urls = []
|
35
|
+
i = 0
|
36
|
+
text.gsub!(/https?:\/\/([-\w\.]+)+(:\d+)?(\/([\w\/_\.]*(\?\S+)?)?)?/) do |s|
|
37
|
+
urls[i] = s.dup
|
38
|
+
i += 1
|
39
|
+
"__urls__#{i}__"
|
40
|
+
end
|
41
|
+
|
32
42
|
# replace double dash with ndash and triple dash with mdash
|
33
43
|
if @fix_dashes
|
34
44
|
text.gsub!(/-{3}/,'—')
|
@@ -89,33 +99,45 @@ module Virastar
|
|
89
99
|
end
|
90
100
|
# ----------------------------------------------------------------
|
91
101
|
|
102
|
+
# should fix outside and inside spacing for () [] {} “” «»
|
103
|
+
if @fix_spacing_for_braces_and_quotes
|
104
|
+
text.gsub!(/[ ]*(\()\s*([^)]+?)\s*?(\))[ ]*/,' \1\2\3 ')
|
105
|
+
text.gsub!(/[ ]*(\[)\s*([^)]+?)\s*?(\])[ ]*/,' \1\2\3 ')
|
106
|
+
text.gsub!(/[ ]*(\{)\s*([^)]+?)\s*?(\})[ ]*/,' \1\2\3 ')
|
107
|
+
text.gsub!(/[ ]*(“)\s*([^)]+?)\s*?(”)[ ]*/,' \1\2\3 ')
|
108
|
+
text.gsub!(/[ ]*(«)\s*([^)]+?)\s*?(»)[ ]*/,' \1\2\3 ')
|
109
|
+
end
|
110
|
+
|
92
111
|
# : ; , . ! ? and their persian equivalents should have one space after and no space before
|
93
112
|
if @fix_spacing_for_braces_and_quotes
|
94
|
-
text.gsub!(/[
|
113
|
+
text.gsub!(/[ ]*([:;,؛،.؟!]{1})[ ]*/, '\1 ')
|
95
114
|
# do not put space after colon that separates time parts
|
96
115
|
text.gsub!(/([۰-۹]+):\s+([۰-۹]+)/, '\1:\2')
|
97
116
|
end
|
98
|
-
|
99
|
-
|
100
117
|
|
101
|
-
# should fix spacing for () [] {} “” «»
|
118
|
+
# should fix inside spacing for () [] {} “” «»
|
102
119
|
if @fix_spacing_for_braces_and_quotes
|
103
|
-
text.gsub!(
|
104
|
-
text.gsub!(
|
105
|
-
text.gsub!(
|
106
|
-
text.gsub!(
|
107
|
-
text.gsub!(
|
120
|
+
text.gsub!(/(\()\s*([^)]+?)\s*?(\))/,'\1\2\3')
|
121
|
+
text.gsub!(/(\[)\s*([^)]+?)\s*?(\])/,'\1\2\3')
|
122
|
+
text.gsub!(/(\{)\s*([^)]+?)\s*?(\})/,'\1\2\3')
|
123
|
+
text.gsub!(/(“)\s*([^)]+?)\s*?(”)/,'\1\2\3')
|
124
|
+
text.gsub!(/(«)\s*([^)]+?)\s*?(»)/,'\1\2\3')
|
108
125
|
end
|
109
126
|
|
110
127
|
# should replace more than one space with just a single one
|
111
128
|
if @cleanup_spacing
|
112
129
|
text.gsub!(/[ ]+/,' ')
|
113
|
-
text.gsub!(/([\n]+)[
|
130
|
+
text.gsub!(/([\n]+)[ ]*/,'\1')
|
114
131
|
end
|
115
132
|
|
116
133
|
# remove spaces, tabs, and new lines from the beginning and end of file
|
117
134
|
text.strip! if @cleanup_begin_and_end
|
118
135
|
|
136
|
+
# bringing back urls
|
137
|
+
text.gsub!(/__urls__\d+__/) do |s|
|
138
|
+
urls[s.split("__").last.to_i - 1]
|
139
|
+
end
|
140
|
+
|
119
141
|
text
|
120
142
|
end
|
121
143
|
|
data/spec/virastar_spec.rb
CHANGED
@@ -193,16 +193,24 @@ describe Virastar do
|
|
193
193
|
test.persian_cleanup.should == result
|
194
194
|
end
|
195
195
|
|
196
|
-
it "should not destroy URLs"
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
196
|
+
it "should not destroy URLs" do
|
197
|
+
test = "http://virastar.heroku.com"
|
198
|
+
result = "http://virastar.heroku.com"
|
199
|
+
test2 = "http://virastar.heroku.com\nhttp://balatarin.com"
|
200
|
+
result2 = "http://virastar.heroku.com\nhttp://balatarin.com"
|
201
|
+
test.persian_cleanup.should == result
|
202
|
+
test2.persian_cleanup.should == result2
|
203
|
+
end
|
202
204
|
|
203
205
|
it "should not replace line breaks when the line ends with quotes" do
|
204
|
-
test =
|
205
|
-
result =
|
206
|
+
test = "salam \"khoobi\" \n chetori"
|
207
|
+
result = "salam «khoobi» \nchetori"
|
208
|
+
test.persian_cleanup.should == result
|
209
|
+
end
|
210
|
+
|
211
|
+
it "should not put space after quotes, {}, () or [] if there's ,.; just after that" do
|
212
|
+
test = "«This», {this}, (this), [this] or {this}. sometimes (this)."
|
213
|
+
result = "«This»، {this}، (this)، [this] or {this}. sometimes (this)."
|
206
214
|
test.persian_cleanup.should == result
|
207
215
|
end
|
208
216
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: virastar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 5
|
10
|
+
version: 0.0.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Allen A. Bargi
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-23 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|