virastar 0.0.4 → 0.0.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/TODO +1 -10
- data/lib/virastar/version.rb +1 -1
- data/lib/virastar.rb +32 -10
- data/spec/virastar_spec.rb +16 -8
- metadata +4 -4
data/TODO
CHANGED
@@ -1,10 +1 @@
|
|
1
|
-
-
|
2
|
-
- do not destroy urls dots and colons
|
3
|
-
/https?://([-\w\.]+)+(:\d+)?(/([\w/_\.]*(\?\S+)?)?)?/
|
4
|
-
|
5
|
-
- translate to js
|
6
|
-
|
7
|
-
|
8
|
-
DONE:
|
9
|
-
- spacing after , : ; causes a lot of problems in cases like (,) => (, )
|
10
|
-
- do not put space after : in the context of numbers like hour 19:45 => ۱۹:۴۵
|
1
|
+
- translate to js
|
data/lib/virastar/version.rb
CHANGED
data/lib/virastar.rb
CHANGED
@@ -29,6 +29,16 @@ module Virastar
|
|
29
29
|
|
30
30
|
def cleanup
|
31
31
|
text = @text
|
32
|
+
|
33
|
+
# remove URLs here and replace them with placeholders; they are brought back at the end of the process
|
34
|
+
urls = []
|
35
|
+
i = 0
|
36
|
+
text.gsub!(/https?:\/\/([-\w\.]+)+(:\d+)?(\/([\w\/_\.]*(\?\S+)?)?)?/) do |s|
|
37
|
+
urls[i] = s.dup
|
38
|
+
i += 1
|
39
|
+
"__urls__#{i}__"
|
40
|
+
end
|
41
|
+
|
32
42
|
# replace double dash to ndash and triple dash to mdash
|
33
43
|
if @fix_dashes
|
34
44
|
text.gsub!(/-{3}/,'—')
|
@@ -89,33 +99,45 @@ module Virastar
|
|
89
99
|
end
|
90
100
|
# ----------------------------------------------------------------
|
91
101
|
|
102
|
+
# should fix outside and inside spacing for () [] {} “” «»
|
103
|
+
if @fix_spacing_for_braces_and_quotes
|
104
|
+
text.gsub!(/[ ]*(\()\s*([^)]+?)\s*?(\))[ ]*/,' \1\2\3 ')
|
105
|
+
text.gsub!(/[ ]*(\[)\s*([^)]+?)\s*?(\])[ ]*/,' \1\2\3 ')
|
106
|
+
text.gsub!(/[ ]*(\{)\s*([^)]+?)\s*?(\})[ ]*/,' \1\2\3 ')
|
107
|
+
text.gsub!(/[ ]*(“)\s*([^)]+?)\s*?(”)[ ]*/,' \1\2\3 ')
|
108
|
+
text.gsub!(/[ ]*(«)\s*([^)]+?)\s*?(»)[ ]*/,' \1\2\3 ')
|
109
|
+
end
|
110
|
+
|
92
111
|
# : ; , . ! ? and their persian equivalents should have one space after and no space before
|
93
112
|
if @fix_spacing_for_braces_and_quotes
|
94
|
-
text.gsub!(/[
|
113
|
+
text.gsub!(/[ ]*([:;,؛،.؟!]{1})[ ]*/, '\1 ')
|
95
114
|
# do not put space after colon that separates time parts
|
96
115
|
text.gsub!(/([۰-۹]+):\s+([۰-۹]+)/, '\1:\2')
|
97
116
|
end
|
98
|
-
|
99
|
-
|
100
117
|
|
101
|
-
# should fix spacing for () [] {} “” «»
|
118
|
+
# should fix inside spacing for () [] {} “” «»
|
102
119
|
if @fix_spacing_for_braces_and_quotes
|
103
|
-
text.gsub!(
|
104
|
-
text.gsub!(
|
105
|
-
text.gsub!(
|
106
|
-
text.gsub!(
|
107
|
-
text.gsub!(
|
120
|
+
text.gsub!(/(\()\s*([^)]+?)\s*?(\))/,'\1\2\3')
|
121
|
+
text.gsub!(/(\[)\s*([^)]+?)\s*?(\])/,'\1\2\3')
|
122
|
+
text.gsub!(/(\{)\s*([^)]+?)\s*?(\})/,'\1\2\3')
|
123
|
+
text.gsub!(/(“)\s*([^)]+?)\s*?(”)/,'\1\2\3')
|
124
|
+
text.gsub!(/(«)\s*([^)]+?)\s*?(»)/,'\1\2\3')
|
108
125
|
end
|
109
126
|
|
110
127
|
# should replace more than one space with just a single one
|
111
128
|
if @cleanup_spacing
|
112
129
|
text.gsub!(/[ ]+/,' ')
|
113
|
-
text.gsub!(/([\n]+)[
|
130
|
+
text.gsub!(/([\n]+)[ ]*/,'\1')
|
114
131
|
end
|
115
132
|
|
116
133
|
# remove spaces, tabs, and new lines from the beginning and end of file
|
117
134
|
text.strip! if @cleanup_begin_and_end
|
118
135
|
|
136
|
+
# bringing back urls
|
137
|
+
text.gsub!(/__urls__\d+__/) do |s|
|
138
|
+
urls[s.split("__").last.to_i - 1]
|
139
|
+
end
|
140
|
+
|
119
141
|
text
|
120
142
|
end
|
121
143
|
|
data/spec/virastar_spec.rb
CHANGED
@@ -193,16 +193,24 @@ describe Virastar do
|
|
193
193
|
test.persian_cleanup.should == result
|
194
194
|
end
|
195
195
|
|
196
|
-
it "should not destroy URLs"
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
196
|
+
it "should not destroy URLs" do
|
197
|
+
test = "http://virastar.heroku.com"
|
198
|
+
result = "http://virastar.heroku.com"
|
199
|
+
test2 = "http://virastar.heroku.com\nhttp://balatarin.com"
|
200
|
+
result2 = "http://virastar.heroku.com\nhttp://balatarin.com"
|
201
|
+
test.persian_cleanup.should == result
|
202
|
+
test2.persian_cleanup.should == result2
|
203
|
+
end
|
202
204
|
|
203
205
|
it "should not replace line breaks when the line ends with quotes" do
|
204
|
-
test =
|
205
|
-
result =
|
206
|
+
test = "salam \"khoobi\" \n chetori"
|
207
|
+
result = "salam «khoobi» \nchetori"
|
208
|
+
test.persian_cleanup.should == result
|
209
|
+
end
|
210
|
+
|
211
|
+
it "should not put space after quotes, {}, () or [] if there's ,.; just after that" do
|
212
|
+
test = "«This», {this}, (this), [this] or {this}. sometimes (this)."
|
213
|
+
result = "«This»، {this}، (this)، [this] or {this}. sometimes (this)."
|
206
214
|
test.persian_cleanup.should == result
|
207
215
|
end
|
208
216
|
|
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: virastar
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 21
|
5
5
|
prerelease:
|
6
6
|
segments:
|
7
7
|
- 0
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 0.0.
|
9
|
+
- 5
|
10
|
+
version: 0.0.5
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- Allen A. Bargi
|
@@ -15,7 +15,7 @@ autorequire:
|
|
15
15
|
bindir: bin
|
16
16
|
cert_chain: []
|
17
17
|
|
18
|
-
date: 2011-01-
|
18
|
+
date: 2011-01-23 00:00:00 +01:00
|
19
19
|
default_executable:
|
20
20
|
dependencies:
|
21
21
|
- !ruby/object:Gem::Dependency
|