string_cleaner 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore CHANGED
@@ -1,5 +1,6 @@
1
- *.sw?
1
+ .bundle
2
2
  .DS_Store
3
3
  coverage
4
4
  rdoc
5
5
  pkg
6
+ *.gem
data/Rakefile CHANGED
@@ -1,46 +1,26 @@
1
- require "rubygems"
2
- require "rake"
3
-
1
+ require 'rubygems'
4
2
  begin
5
- require "jeweler"
6
- Jeweler::Tasks.new do |gem|
7
- gem.name = "string_cleaner"
8
- gem.summary = %Q{TODO}
9
- gem.email = "joseph@openhood.com"
10
- gem.homepage = "http://github.com/JosephHalter/string_cleaner"
11
- gem.authors = ["Joseph Halter"]
12
-
13
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
14
- end
3
+ require 'bundler/setup'
15
4
  rescue LoadError
16
- puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
5
+ puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
17
6
  end
18
7
 
19
- require "spec/rake/spectask"
20
- Spec::Rake::SpecTask.new(:spec) do |spec|
21
- spec.libs << "lib" << "spec"
22
- spec.spec_files = FileList["spec/**/*_spec.rb"]
8
+ require 'rspec/core/rake_task'
9
+ RSpec::Core::RakeTask.new(:spec) do |spec|
10
+ spec.pattern = 'spec/**/*_spec.rb'
23
11
  end
24
12
 
25
- Spec::Rake::SpecTask.new(:rcov) do |spec|
26
- spec.libs << "lib" << "spec"
27
- spec.pattern = "spec/**/*_spec.rb"
13
+ RSpec::Core::RakeTask.new(:rcov) do |spec|
14
+ spec.pattern = 'spec/**/*_spec.rb'
28
15
  spec.rcov = true
29
16
  end
30
17
 
31
18
  task :default => :spec
32
19
 
33
- require "rake/rdoctask"
20
+ require 'rake/rdoctask'
34
21
  Rake::RDocTask.new do |rdoc|
35
- if File.exist?("VERSION.yml")
36
- config = YAML.load(File.read("VERSION.yml"))
37
- version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
38
- else
39
- version = ""
40
- end
41
-
42
- rdoc.rdoc_dir = "rdoc"
43
- rdoc.title = "test #{version}"
44
- rdoc.rdoc_files.include("README*")
45
- rdoc.rdoc_files.include("lib/**/*.rb")
46
- end
22
+ rdoc.rdoc_dir = 'rdoc'
23
+ rdoc.title = "string_cleaner"
24
+ rdoc.rdoc_files.include('README*')
25
+ rdoc.rdoc_files.include('lib/**/*.rb')
26
+ end
@@ -1,56 +1,170 @@
1
1
  # encoding: UTF-8
2
+ require "unidecoder"
3
+
2
4
  module String::Cleaner
3
5
 
4
- # convert invalid UTF-8 strings from ISO-8859-15 to UTF-8 to fix them
5
- # recognize euro char from both ISO 8859-15 and Windows-1252
6
- # replace \r\n and \r with \n normalizing end of lines
7
- # replace control characters and other invisible chars by spaces
8
6
  def clean
9
- utf8 = self.dup
10
- if utf8.respond_to?(:force_encoding)
7
+ fix_encoding.fix_endlines.fix_invisible_chars
8
+ end
11
9
 
12
- # for Ruby 1.9+
13
- utf8.force_encoding("UTF-8")
10
+ def fix_encoding
11
+ utf8 = dup
12
+ if utf8.respond_to?(:force_encoding)
13
+ utf8.force_encoding("UTF-8") # for Ruby 1.9+
14
14
  unless utf8.valid_encoding? # if invalid UTF-8
15
15
  utf8 = utf8.force_encoding("ISO8859-15")
16
16
  utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
17
17
  utf8.gsub!("\xC2\x80", "€") # special case for euro sign from Windows-1252
18
18
  utf8.force_encoding("UTF-8")
19
19
  end
20
-
21
- # normalize end of lines
22
- utf8.gsub!(/\r\n/u, "\n")
23
- utf8.gsub!(/\r/u, "\n")
24
-
25
- # normalize invisible chars
26
- utf8 = (utf8 << " ").split(/\n/u).each{|line|
27
- line.gsub!(/[\s\p{C}]/u, " ")
28
- }.join("\n").chop!
29
- utf8.force_encoding("UTF-8")
20
+ utf8
30
21
  else
31
-
32
- # for Ruby 1.8.6, use iconv
33
22
  require "iconv"
34
23
  utf8 << " "
35
- utf8 = begin
24
+ begin
36
25
  Iconv.new("UTF-8", "UTF-8").iconv(utf8)
37
26
  rescue
38
27
  utf8.gsub!(/\x80/n, "\xA4")
39
- utf8 = Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
28
+ Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
40
29
  end
30
+ end
31
+ end
41
32
 
42
- # normalize end of lines
43
- utf8.gsub!(/\r\n/n, "\n")
44
- utf8.gsub!(/\r/n, "\n")
45
-
46
- # normalize invisible chars using oniguruma
33
+ def fix_endlines
34
+ gsub(/(?:\r\n|\r)/u, "\n")
35
+ end
36
+
37
+ SPECIAL_SPACES = [
38
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
39
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
40
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
41
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
42
+ 0x2028, # White_Space # Zl LINE SEPARATOR
43
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
44
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
45
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
46
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
47
+ ].flatten.collect{|e| [e].pack 'U*'}
48
+
49
+ def fix_invisible_chars
50
+ utf8 = self.dup
51
+ if utf8.respond_to?(:force_encoding)
52
+ utf8 = (utf8 << " ").split(/\n/u).each{|line|
53
+ line.gsub!(/[\s\p{C}]/u, " ")
54
+ }.join("\n").chop!
55
+ utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|")), " ")
56
+ utf8.force_encoding("UTF-8")
57
+ else
47
58
  require "oniguruma"
48
- utf8 = utf8.split(/\n/n).collect{|line|
59
+ utf8.split(/\n/n).collect{|line|
49
60
  Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
50
61
  }.join("\n").chop!
51
62
  end
52
- utf8
53
- replace(utf8)
63
+ end
64
+
65
+ def trim(chars = "")
66
+ chars.size>0 ? gsub(/\A[#{chars}]+|[#{chars}]+\z/, "") : strip
67
+ end
68
+
69
+ def to_permalink(separator="-")
70
+ fix_endlines.to_ascii(chartable).downcase.gsub(/[^a-z0-9]+/, separator).trim(separator)
71
+ end
72
+
73
+ def nl2br
74
+ gsub("\n", "<br/>\n")
75
+ end
76
+
77
+ def to_nicer_sym
78
+ to_permalink("_").to_sym
79
+ end
80
+
81
+ def chartable(options = {})
82
+ options = {
83
+ :clean_binary => true,
84
+ :translit_symbols => true,
85
+ }.merge(options)
86
+ char = "%c"
87
+ table = {
88
+ "`" => "'", # dec = 96
89
+ "¦" => "|", # dec = 166, broken vertical bar
90
+ "¨" => "", # dec = 168, spacing diaeresis - umlaut
91
+ "ª" => "", # dec = 170, feminine ordinal indicator
92
+ "«" => "\"", # dec = 171, left double angle quotes
93
+ "¬" => "!", # dec = 172, not sign
94
+ "­" => "-", # dec = 173, soft hyphen
95
+ "¯" => "-", # dec = 175, spacing macron - overline
96
+ "²" => "2", # dec = 178, superscript two - squared
97
+ "³" => "3", # dec = 179, superscript three - cubed
98
+ "´" => "'", # dec = 180, acute accent - spacing acute
99
+ "·" => "", # dec = 183, middle dot - Georgian comma
100
+ "¸" => "", # dec = 184, spacing cedilla
101
+ "¹" => "1", # dec = 185, superscript one
102
+ "º" => "0", # dec = 186, masculine ordinal indicator
103
+ "»" => "\"", # dec = 187, right double angle quotes
104
+ "¿" => "", # dec = 191, inverted question mark
105
+ "Ý" => "Y", # dec = 221
106
+ "–" => "-", # hex = 2013, en dash
107
+ "—" => "-", # hex = 2014, em dash
108
+ "‚" => "'", # hex = 201A, single low-9 quotation mark
109
+ "„" => "\"", # hex = 201E, double low-9 quotation mark
110
+ }
111
+ if options[:clean_binary]
112
+ table[char % 0] = "" # null
113
+ table[char % 1] = "" # start of heading
114
+ table[char % 2] = "" # start of text
115
+ table[char % 3] = "" # end of text
116
+ table[char % 4] = "" # end of transmission
117
+ table[char % 5] = "" # enquiry
118
+ table[char % 6] = "" # acknowledge
119
+ table[char % 7] = "" # bell
120
+ table[char % 8] = "" # backspace
121
+ table[char % 9] = " " # tab
122
+ table[char % 11] = "" # vertical tab
123
+ table[char % 12] = "" # form feed
124
+ table[char % 14] = "" # shift out
125
+ table[char % 15] = "" # shift in
126
+ table[char % 16] = "" # data link escape
127
+ table[char % 17] = "" # device control 1
128
+ table[char % 18] = "" # device control 2
129
+ table[char % 19] = "" # device control 3
130
+ table[char % 20] = "" # device control 4
131
+ table[char % 21] = "" # negative acknowledgement
132
+ table[char % 22] = "" # synchronous idle
133
+ table[char % 23] = "" # end of transmission block
134
+ table[char % 24] = "" # cancel
135
+ table[char % 25] = "" # end of medium
136
+ table[char % 26] = "" # substitute
137
+ table[char % 27] = "" # escape
138
+ table[char % 28] = "" # file separator
139
+ table[char % 29] = "" # group separator
140
+ table[char % 30] = "" # record separator
141
+ table[char % 31] = "" # unit separator
142
+ table[char % 127] = "" # delete
143
+ end
144
+ if options[:translit_symbols]
145
+ table["$"] = " dollars " # dec = 36, dollar sign
146
+ table["%"] = " percent " # dec = 37, percent sign
147
+ table["&"] = " and " # dec = 38, ampersand
148
+ table["@"] = " at " # dec = 64, at symbol
149
+ table[char % 128] = " euros " # windows euro
150
+ table["¢"] = " cents " # dec = 162, cent sign
151
+ table["£"] = " pounds " # dec = 163, pound sign
152
+ table["¤"] = " euros " # dec = 164, currency sign
153
+ table["¥"] = " yens " # dec = 165, yen sign
154
+ table["§"] = " section " # dec = 167, section sign
155
+ table["©"] = " copyright " # dec = 169, copyright sign
156
+ table["®"] = " registered trademark " # dec = 174, registered trade mark sign
157
+ table["°"] = " degrees " # dec = 176, degree sign
158
+ table["±"] = " approx " # dec = 177, plus-or-minus sign
159
+ table["µ"] = " micro " # dec = 181, micro sign
160
+ table["¶"] = " paragraph " # dec = 182, pilcrow sign - paragraph sign
161
+ table["¼"] = " 1/4 " # dec = 188, fraction one quarter
162
+ table["½"] = " 1/2 " # dec = 189, fraction one half
163
+ table["¾"] = " 3/4 " # dec = 190, fraction three quarters
164
+ table["€"] = " euros " # hex = 20AC, unicode euro
165
+ table["™"] = " trademark " # hex = 2122, trade mark
166
+ end
167
+ table
54
168
  end
55
169
 
56
170
  end
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,2 @@
1
- require "spec"
2
- require "rubygems"
3
- require File.join(File.dirname(__FILE__), "..", "lib", "string_cleaner")
1
+ require 'bundler/setup'
2
+ require 'string_cleaner'
@@ -19,6 +19,23 @@ describe String::Cleaner do
19
19
  @output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
20
20
  end
21
21
  end
22
+ it "should convert all type of spaces to normal spaces" do
23
+ input = [
24
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
25
+ 0x0020, # White_Space # Zs SPACE
26
+ 0x0085, # White_Space # Cc <control-0085>
27
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
28
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
29
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
30
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
31
+ 0x2028, # White_Space # Zl LINE SEPARATOR
32
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
33
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
34
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
35
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
36
+ ].flatten.collect{ |e| [e].pack 'U*' }
37
+ input.join.clean.should == " \n \n "
38
+ end
22
39
  describe "with invalid UTF-8 sequence" do
23
40
  before :all do
24
41
  @input = "\210\004"
@@ -201,4 +218,54 @@ describe String::Cleaner do
201
218
  end
202
219
  end
203
220
  end
221
+ describe "#trim(chars = \"\")" do
222
+ it "should use #strip when used without params" do
223
+ string, expected = "", mock
224
+ string.stub(:strip).and_return expected
225
+ string.trim.should be expected
226
+ end
227
+ it "should remove multiple characters at once from beginning and end" do
228
+ prefix, suffix = " rhuif dww f f", "dqz qafdédsj iowe fcms. qpo asttt t dtt"
229
+ to_remove = "acdeéfhijmopqrstuwz "
230
+ "#{prefix}d#{suffix}".trim(to_remove).should eql "."
231
+ "#{prefix}D#{suffix}".trim(to_remove).should eql "Ddqz qafdédsj iowe fcms."
232
+ end
233
+ end
234
+ describe "#fix_endlines" do
235
+ it "should convert windows endlines" do
236
+ "this is a\r\ntest\r\n".fix_endlines.should eql "this is a\ntest\n"
237
+ end
238
+ it "should convert old mac endlines" do
239
+ "this is a\rtest\r".fix_endlines.should eql "this is a\ntest\n"
240
+ end
241
+ it "should not modify proper linux endlines" do
242
+ "this is a\ntest\n".fix_endlines.should eql "this is a\ntest\n"
243
+ end
244
+ it "should convert mixed endlines" do
245
+ "this is a\n\rtest\r\n".fix_endlines.should eql "this is a\n\ntest\n"
246
+ end
247
+ end
248
+ describe "#to_permalink(separator=\"-\")" do
249
+ it "should create nice permalink for string with many accents" do
250
+ crazy = " ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüý - Hello world, I'm a crazy string!! "
251
+ crazy.to_permalink.should == "aaaaaaceeeeiiiidnoooooxouuuuyaaaaaaceeeeiiiinoooooouuuuy-hello-world-i-m-a-crazy-string"
252
+ end
253
+ it "should create nice permalink even for evil string" do
254
+ evil = (128..255).inject(""){ |acc, b| acc += ("%c" % b) }
255
+ evil.to_permalink.should == "euros-cents-pounds-euros-yens-section-copyright-registered-trademark-degrees-approx-23-micro-paragraph-10-1-4-1-2-3-4-aaaaaaaeceeeeiiiidnoooooxouuuuythssaaaaaaaeceeeeiiiidnooooo-ouuuuythy"
256
+ end
257
+ it "should remove endlines too" do
258
+ "this\nis\ta\ntest".to_permalink("_").should eql "this_is_a_test"
259
+ end
260
+ end
261
+ describe "#nl2br" do
262
+ it "should convert \n to <br/>\n" do
263
+ "this\nis\ta\ntest\r".nl2br.should eql "this<br/>\nis\ta<br/>\ntest\r"
264
+ end
265
+ end
266
+ describe "#to_nicer_sym" do
267
+ it "should convert \"Select or Other\" to :select_or_other" do
268
+ "Select or Other".to_nicer_sym.should be :select_or_other
269
+ end
270
+ end
204
271
  end
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{string_cleaner}
5
- s.version = "0.1.0"
5
+ s.version = "0.2.0"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Joseph Halter"]
9
- s.date = %q{2009-08-26}
9
+ s.date = %q{2010-10-18}
10
10
  s.email = %q{joseph@openhood.com}
11
11
  s.extra_rdoc_files = [
12
12
  "LICENSE",
@@ -17,7 +17,6 @@ Gem::Specification.new do |s|
17
17
  "LICENSE",
18
18
  "README.rdoc",
19
19
  "Rakefile",
20
- "VERSION",
21
20
  "lib/string_cleaner.rb",
22
21
  "spec/spec_helper.rb",
23
22
  "spec/string_cleaner_spec.rb",
@@ -33,6 +32,8 @@ Gem::Specification.new do |s|
33
32
  "spec/spec_helper.rb",
34
33
  "spec/string_cleaner_spec.rb"
35
34
  ]
35
+ s.add_runtime_dependency "unidecoder"
36
+ s.add_development_dependency "rspec"
36
37
 
37
38
  if s.respond_to? :specification_version then
38
39
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string_cleaner
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
- - 1
7
+ - 2
9
8
  - 0
10
- version: 0.1.0
9
+ version: 0.2.0
11
10
  platform: ruby
12
11
  authors:
13
12
  - Joseph Halter
@@ -15,10 +14,35 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2009-08-26 00:00:00 +02:00
17
+ date: 2010-10-18 00:00:00 +02:00
19
18
  default_executable:
20
- dependencies: []
21
-
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: unidecoder
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :runtime
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: rspec
35
+ prerelease: false
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
43
+ version: "0"
44
+ type: :development
45
+ version_requirements: *id002
22
46
  description:
23
47
  email: joseph@openhood.com
24
48
  executables: []
@@ -33,7 +57,6 @@ files:
33
57
  - LICENSE
34
58
  - README.rdoc
35
59
  - Rakefile
36
- - VERSION
37
60
  - lib/string_cleaner.rb
38
61
  - spec/spec_helper.rb
39
62
  - spec/string_cleaner_spec.rb
@@ -52,7 +75,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
52
75
  requirements:
53
76
  - - ">="
54
77
  - !ruby/object:Gem::Version
55
- hash: 3
56
78
  segments:
57
79
  - 0
58
80
  version: "0"
@@ -61,7 +83,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
61
83
  requirements:
62
84
  - - ">="
63
85
  - !ruby/object:Gem::Version
64
- hash: 3
65
86
  segments:
66
87
  - 0
67
88
  version: "0"
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.1.0