string_cleaner 0.1.0 → 0.2.0

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between the package versions as they appear in their respective public registries.
data/.gitignore CHANGED
@@ -1,5 +1,6 @@
1
- *.sw?
1
+ .bundle
2
2
  .DS_Store
3
3
  coverage
4
4
  rdoc
5
5
  pkg
6
+ *.gem
data/Rakefile CHANGED
@@ -1,46 +1,26 @@
1
- require "rubygems"
2
- require "rake"
3
-
1
+ require 'rubygems'
4
2
  begin
5
- require "jeweler"
6
- Jeweler::Tasks.new do |gem|
7
- gem.name = "string_cleaner"
8
- gem.summary = %Q{TODO}
9
- gem.email = "joseph@openhood.com"
10
- gem.homepage = "http://github.com/JosephHalter/string_cleaner"
11
- gem.authors = ["Joseph Halter"]
12
-
13
- # gem is a Gem::Specification... see http://www.rubygems.org/read/chapter/20 for additional settings
14
- end
3
+ require 'bundler/setup'
15
4
  rescue LoadError
16
- puts "Jeweler not available. Install it with: sudo gem install technicalpickles-jeweler -s http://gems.github.com"
5
+ puts 'You must `gem install bundler` and `bundle install` to run rake tasks'
17
6
  end
18
7
 
19
- require "spec/rake/spectask"
20
- Spec::Rake::SpecTask.new(:spec) do |spec|
21
- spec.libs << "lib" << "spec"
22
- spec.spec_files = FileList["spec/**/*_spec.rb"]
8
+ require 'rspec/core/rake_task'
9
+ RSpec::Core::RakeTask.new(:spec) do |spec|
10
+ spec.pattern = 'spec/**/*_spec.rb'
23
11
  end
24
12
 
25
- Spec::Rake::SpecTask.new(:rcov) do |spec|
26
- spec.libs << "lib" << "spec"
27
- spec.pattern = "spec/**/*_spec.rb"
13
+ RSpec::Core::RakeTask.new(:rcov) do |spec|
14
+ spec.pattern = 'spec/**/*_spec.rb'
28
15
  spec.rcov = true
29
16
  end
30
17
 
31
18
  task :default => :spec
32
19
 
33
- require "rake/rdoctask"
20
+ require 'rake/rdoctask'
34
21
  Rake::RDocTask.new do |rdoc|
35
- if File.exist?("VERSION.yml")
36
- config = YAML.load(File.read("VERSION.yml"))
37
- version = "#{config[:major]}.#{config[:minor]}.#{config[:patch]}"
38
- else
39
- version = ""
40
- end
41
-
42
- rdoc.rdoc_dir = "rdoc"
43
- rdoc.title = "test #{version}"
44
- rdoc.rdoc_files.include("README*")
45
- rdoc.rdoc_files.include("lib/**/*.rb")
46
- end
22
+ rdoc.rdoc_dir = 'rdoc'
23
+ rdoc.title = "string_cleaner"
24
+ rdoc.rdoc_files.include('README*')
25
+ rdoc.rdoc_files.include('lib/**/*.rb')
26
+ end
@@ -1,56 +1,170 @@
1
1
  # encoding: UTF-8
2
+ require "unidecoder"
3
+
2
4
  module String::Cleaner
3
5
 
4
- # convert invalid UTF-8 strings from ISO-8859-15 to UTF-8 to fix them
5
- # recognize euro char from both ISO 8859-15 and Windows-1252
6
- # replace \r\n and \r with \n normalizing end of lines
7
- # replace control characters and other invisible chars by spaces
8
6
  def clean
9
- utf8 = self.dup
10
- if utf8.respond_to?(:force_encoding)
7
+ fix_encoding.fix_endlines.fix_invisible_chars
8
+ end
11
9
 
12
- # for Ruby 1.9+
13
- utf8.force_encoding("UTF-8")
10
+ def fix_encoding
11
+ utf8 = dup
12
+ if utf8.respond_to?(:force_encoding)
13
+ utf8.force_encoding("UTF-8") # for Ruby 1.9+
14
14
  unless utf8.valid_encoding? # if invalid UTF-8
15
15
  utf8 = utf8.force_encoding("ISO8859-15")
16
16
  utf8.encode!("UTF-8", :invalid => :replace, :undef => :replace, :replace => "")
17
17
  utf8.gsub!("\xC2\x80", "€") # special case for euro sign from Windows-1252
18
18
  utf8.force_encoding("UTF-8")
19
19
  end
20
-
21
- # normalize end of lines
22
- utf8.gsub!(/\r\n/u, "\n")
23
- utf8.gsub!(/\r/u, "\n")
24
-
25
- # normalize invisible chars
26
- utf8 = (utf8 << " ").split(/\n/u).each{|line|
27
- line.gsub!(/[\s\p{C}]/u, " ")
28
- }.join("\n").chop!
29
- utf8.force_encoding("UTF-8")
20
+ utf8
30
21
  else
31
-
32
- # for Ruby 1.8.6, use iconv
33
22
  require "iconv"
34
23
  utf8 << " "
35
- utf8 = begin
24
+ begin
36
25
  Iconv.new("UTF-8", "UTF-8").iconv(utf8)
37
26
  rescue
38
27
  utf8.gsub!(/\x80/n, "\xA4")
39
- utf8 = Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
28
+ Iconv.new("UTF-8//IGNORE", "ISO8859-15").iconv(utf8)
40
29
  end
30
+ end
31
+ end
41
32
 
42
- # normalize end of lines
43
- utf8.gsub!(/\r\n/n, "\n")
44
- utf8.gsub!(/\r/n, "\n")
45
-
46
- # normalize invisible chars using oniguruma
33
+ def fix_endlines
34
+ gsub(/(?:\r\n|\r)/u, "\n")
35
+ end
36
+
37
+ SPECIAL_SPACES = [
38
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
39
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
40
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
41
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
42
+ 0x2028, # White_Space # Zl LINE SEPARATOR
43
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
44
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
45
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
46
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
47
+ ].flatten.collect{|e| [e].pack 'U*'}
48
+
49
+ def fix_invisible_chars
50
+ utf8 = self.dup
51
+ if utf8.respond_to?(:force_encoding)
52
+ utf8 = (utf8 << " ").split(/\n/u).each{|line|
53
+ line.gsub!(/[\s\p{C}]/u, " ")
54
+ }.join("\n").chop!
55
+ utf8.gsub!(Regexp.new(SPECIAL_SPACES.join("|")), " ")
56
+ utf8.force_encoding("UTF-8")
57
+ else
47
58
  require "oniguruma"
48
- utf8 = utf8.split(/\n/n).collect{|line|
59
+ utf8.split(/\n/n).collect{|line|
49
60
  Oniguruma::ORegexp.new("[\\s\\p{C}]", {:encoding => Oniguruma::ENCODING_UTF8}).gsub(line, " ")
50
61
  }.join("\n").chop!
51
62
  end
52
- utf8
53
- replace(utf8)
63
+ end
64
+
65
+ def trim(chars = "")
66
+ chars.size>0 ? gsub(/\A[#{chars}]+|[#{chars}]+\z/, "") : strip
67
+ end
68
+
69
+ def to_permalink(separator="-")
70
+ fix_endlines.to_ascii(chartable).downcase.gsub(/[^a-z0-9]+/, separator).trim(separator)
71
+ end
72
+
73
+ def nl2br
74
+ gsub("\n", "<br/>\n")
75
+ end
76
+
77
+ def to_nicer_sym
78
+ to_permalink("_").to_sym
79
+ end
80
+
81
+ def chartable(options = {})
82
+ options = {
83
+ :clean_binary => true,
84
+ :translit_symbols => true,
85
+ }.merge(options)
86
+ char = "%c"
87
+ table = {
88
+ "`" => "'", # dec = 96
89
+ "¦" => "|", # dec = 166, broken vertical bar
90
+ "¨" => "", # dec = 168, spacing diaeresis - umlaut
91
+ "ª" => "", # dec = 170, feminine ordinal indicator
92
+ "«" => "\"", # dec = 171, left double angle quotes
93
+ "¬" => "!", # dec = 172, not sign
94
+ "­" => "-", # dec = 173, soft hyphen
95
+ "¯" => "-", # dec = 175, spacing macron - overline
96
+ "²" => "2", # dec = 178, superscript two - squared
97
+ "³" => "3", # dec = 179, superscript three - cubed
98
+ "´" => "'", # dec = 180, acute accent - spacing acute
99
+ "·" => "", # dec = 183, middle dot - Georgian comma
100
+ "¸" => "", # dec = 184, spacing cedilla
101
+ "¹" => "1", # dec = 185, superscript one
102
+ "º" => "0", # dec = 186, masculine ordinal indicator
103
+ "»" => "\"", # dec = 187, right double angle quotes
104
+ "¿" => "", # dec = 191, inverted question mark
105
+ "Ý" => "Y", # dec = 221
106
+ "–" => "-", # hex = 2013, en dash
107
+ "—" => "-", # hex = 2014, em dash
108
+ "‚" => "'", # hex = 201A, single low-9 quotation mark
109
+ "„" => "\"", # hex = 201E, double low-9 quotation mark
110
+ }
111
+ if options[:clean_binary]
112
+ table[char % 0] = "" # null
113
+ table[char % 1] = "" # start of heading
114
+ table[char % 2] = "" # start of text
115
+ table[char % 3] = "" # end of text
116
+ table[char % 4] = "" # end of transmission
117
+ table[char % 5] = "" # enquiry
118
+ table[char % 6] = "" # acknowledge
119
+ table[char % 7] = "" # bell
120
+ table[char % 8] = "" # backspace
121
+ table[char % 9] = " " # tab
122
+ table[char % 11] = "" # vertical tab
123
+ table[char % 12] = "" # form feed
124
+ table[char % 14] = "" # shift out
125
+ table[char % 15] = "" # shift in
126
+ table[char % 16] = "" # data link escape
127
+ table[char % 17] = "" # device control 1
128
+ table[char % 18] = "" # device control 2
129
+ table[char % 19] = "" # device control 3
130
+ table[char % 20] = "" # device control 4
131
+ table[char % 21] = "" # negative acknowledgement
132
+ table[char % 22] = "" # synchronous idle
133
+ table[char % 23] = "" # end of transmission block
134
+ table[char % 24] = "" # cancel
135
+ table[char % 25] = "" # end of medium
136
+ table[char % 26] = "" # substitute
137
+ table[char % 27] = "" # escape
138
+ table[char % 28] = "" # file separator
139
+ table[char % 29] = "" # group separator
140
+ table[char % 30] = "" # record separator
141
+ table[char % 31] = "" # unit separator
142
+ table[char % 127] = "" # delete
143
+ end
144
+ if options[:translit_symbols]
145
+ table["$"] = " dollars " # dec = 36, dollar sign
146
+ table["%"] = " percent " # dec = 37, percent sign
147
+ table["&"] = " and " # dec = 38, ampersand
148
+ table["@"] = " at " # dec = 64, at symbol
149
+ table[char % 128] = " euros " # windows euro
150
+ table["¢"] = " cents " # dec = 162, cent sign
151
+ table["£"] = " pounds " # dec = 163, pound sign
152
+ table["¤"] = " euros " # dec = 164, currency sign
153
+ table["¥"] = " yens " # dec = 165, yen sign
154
+ table["§"] = " section " # dec = 167, section sign
155
+ table["©"] = " copyright " # dec = 169, copyright sign
156
+ table["®"] = " registered trademark " # dec = 174, registered trade mark sign
157
+ table["°"] = " degrees " # dec = 176, degree sign
158
+ table["±"] = " approx " # dec = 177, plus-or-minus sign
159
+ table["µ"] = " micro " # dec = 181, micro sign
160
+ table["¶"] = " paragraph " # dec = 182, pilcrow sign - paragraph sign
161
+ table["¼"] = " 1/4 " # dec = 188, fraction one quarter
162
+ table["½"] = " 1/2 " # dec = 189, fraction one half
163
+ table["¾"] = " 3/4 " # dec = 190, fraction three quarters
164
+ table["€"] = " euros " # hex = 20AC, unicode euro
165
+ table["™"] = " trademark " # hex = 2122, trade mark
166
+ end
167
+ table
54
168
  end
55
169
 
56
170
  end
data/spec/spec_helper.rb CHANGED
@@ -1,3 +1,2 @@
1
- require "spec"
2
- require "rubygems"
3
- require File.join(File.dirname(__FILE__), "..", "lib", "string_cleaner")
1
+ require 'bundler/setup'
2
+ require 'string_cleaner'
@@ -19,6 +19,23 @@ describe String::Cleaner do
19
19
  @output.should == " \n \n !\"\#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~ € ¡¢£€¥Š§š©ª«¬ ®¯°±²³Žµ¶·ž¹º»ŒœŸ¿ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖרÙÚÛÜÝÞßàáâãäåæçèéêëìíîïðñòóôõö÷øùúûüýþÿ"
20
20
  end
21
21
  end
22
+ it "should convert all type of spaces to normal spaces" do
23
+ input = [
24
+ (0x0009..0x000D).to_a, # White_Space # Cc [5] <control-0009>..<control-000D>
25
+ 0x0020, # White_Space # Zs SPACE
26
+ 0x0085, # White_Space # Cc <control-0085>
27
+ 0x00A0, # White_Space # Zs NO-BREAK SPACE
28
+ 0x1680, # White_Space # Zs OGHAM SPACE MARK
29
+ 0x180E, # White_Space # Zs MONGOLIAN VOWEL SEPARATOR
30
+ (0x2000..0x200A).to_a, # White_Space # Zs [11] EN QUAD..HAIR SPACE
31
+ 0x2028, # White_Space # Zl LINE SEPARATOR
32
+ 0x2029, # White_Space # Zp PARAGRAPH SEPARATOR
33
+ 0x202F, # White_Space # Zs NARROW NO-BREAK SPACE
34
+ 0x205F, # White_Space # Zs MEDIUM MATHEMATICAL SPACE
35
+ 0x3000, # White_Space # Zs IDEOGRAPHIC SPACE
36
+ ].flatten.collect{ |e| [e].pack 'U*' }
37
+ input.join.clean.should == " \n \n "
38
+ end
22
39
  describe "with invalid UTF-8 sequence" do
23
40
  before :all do
24
41
  @input = "\210\004"
@@ -201,4 +218,54 @@ describe String::Cleaner do
201
218
  end
202
219
  end
203
220
  end
221
+ describe "#trim(chars = \"\")" do
222
+ it "should use #strip when used without params" do
223
+ string, expected = "", mock
224
+ string.stub(:strip).and_return expected
225
+ string.trim.should be expected
226
+ end
227
+ it "should remove multiple characters at once from beginning and end" do
228
+ prefix, suffix = " rhuif dww f f", "dqz qafdédsj iowe fcms. qpo asttt t dtt"
229
+ to_remove = "acdeéfhijmopqrstuwz "
230
+ "#{prefix}d#{suffix}".trim(to_remove).should eql "."
231
+ "#{prefix}D#{suffix}".trim(to_remove).should eql "Ddqz qafdédsj iowe fcms."
232
+ end
233
+ end
234
+ describe "#fix_endlines" do
235
+ it "should convert windows endlines" do
236
+ "this is a\r\ntest\r\n".fix_endlines.should eql "this is a\ntest\n"
237
+ end
238
+ it "should convert old mac endlines" do
239
+ "this is a\rtest\r".fix_endlines.should eql "this is a\ntest\n"
240
+ end
241
+ it "should not modify proper linux endlines" do
242
+ "this is a\ntest\n".fix_endlines.should eql "this is a\ntest\n"
243
+ end
244
+ it "should convert mixed endlines" do
245
+ "this is a\n\rtest\r\n".fix_endlines.should eql "this is a\n\ntest\n"
246
+ end
247
+ end
248
+ describe "#to_permalink(separator=\"-\")" do
249
+ it "should create nice permalink for string with many accents" do
250
+ crazy = " ÀÁÂÃÄÅÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖ×ØÙÚÛÜÝàáâãäåçèéêëìíîïñòóôõöøùúûüý - Hello world, I'm a crazy string!! "
251
+ crazy.to_permalink.should == "aaaaaaceeeeiiiidnoooooxouuuuyaaaaaaceeeeiiiinoooooouuuuy-hello-world-i-m-a-crazy-string"
252
+ end
253
+ it "should create nice permalink even for evil string" do
254
+ evil = (128..255).inject(""){ |acc, b| acc += ("%c" % b) }
255
+ evil.to_permalink.should == "euros-cents-pounds-euros-yens-section-copyright-registered-trademark-degrees-approx-23-micro-paragraph-10-1-4-1-2-3-4-aaaaaaaeceeeeiiiidnoooooxouuuuythssaaaaaaaeceeeeiiiidnooooo-ouuuuythy"
256
+ end
257
+ it "should remove endlines too" do
258
+ "this\nis\ta\ntest".to_permalink("_").should eql "this_is_a_test"
259
+ end
260
+ end
261
+ describe "#nl2br" do
262
+ it "should convert \n to <br/>\n" do
263
+ "this\nis\ta\ntest\r".nl2br.should eql "this<br/>\nis\ta<br/>\ntest\r"
264
+ end
265
+ end
266
+ describe "#to_nicer_sym" do
267
+ it "should convert \"Select or Other\" to :select_or_other" do
268
+ "Select or Other".to_nicer_sym.should be :select_or_other
269
+ end
270
+ end
204
271
  end
@@ -2,11 +2,11 @@
2
2
 
3
3
  Gem::Specification.new do |s|
4
4
  s.name = %q{string_cleaner}
5
- s.version = "0.1.0"
5
+ s.version = "0.2.0"
6
6
 
7
7
  s.required_rubygems_version = Gem::Requirement.new(">= 0") if s.respond_to? :required_rubygems_version=
8
8
  s.authors = ["Joseph Halter"]
9
- s.date = %q{2009-08-26}
9
+ s.date = %q{2010-10-18}
10
10
  s.email = %q{joseph@openhood.com}
11
11
  s.extra_rdoc_files = [
12
12
  "LICENSE",
@@ -17,7 +17,6 @@ Gem::Specification.new do |s|
17
17
  "LICENSE",
18
18
  "README.rdoc",
19
19
  "Rakefile",
20
- "VERSION",
21
20
  "lib/string_cleaner.rb",
22
21
  "spec/spec_helper.rb",
23
22
  "spec/string_cleaner_spec.rb",
@@ -33,6 +32,8 @@ Gem::Specification.new do |s|
33
32
  "spec/spec_helper.rb",
34
33
  "spec/string_cleaner_spec.rb"
35
34
  ]
35
+ s.add_runtime_dependency "unidecoder"
36
+ s.add_development_dependency "rspec"
36
37
 
37
38
  if s.respond_to? :specification_version then
38
39
  current_version = Gem::Specification::CURRENT_SPECIFICATION_VERSION
metadata CHANGED
@@ -1,13 +1,12 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: string_cleaner
3
3
  version: !ruby/object:Gem::Version
4
- hash: 27
5
4
  prerelease: false
6
5
  segments:
7
6
  - 0
8
- - 1
7
+ - 2
9
8
  - 0
10
- version: 0.1.0
9
+ version: 0.2.0
11
10
  platform: ruby
12
11
  authors:
13
12
  - Joseph Halter
@@ -15,10 +14,35 @@ autorequire:
15
14
  bindir: bin
16
15
  cert_chain: []
17
16
 
18
- date: 2009-08-26 00:00:00 +02:00
17
+ date: 2010-10-18 00:00:00 +02:00
19
18
  default_executable:
20
- dependencies: []
21
-
19
+ dependencies:
20
+ - !ruby/object:Gem::Dependency
21
+ name: unidecoder
22
+ prerelease: false
23
+ requirement: &id001 !ruby/object:Gem::Requirement
24
+ none: false
25
+ requirements:
26
+ - - ">="
27
+ - !ruby/object:Gem::Version
28
+ segments:
29
+ - 0
30
+ version: "0"
31
+ type: :runtime
32
+ version_requirements: *id001
33
+ - !ruby/object:Gem::Dependency
34
+ name: rspec
35
+ prerelease: false
36
+ requirement: &id002 !ruby/object:Gem::Requirement
37
+ none: false
38
+ requirements:
39
+ - - ">="
40
+ - !ruby/object:Gem::Version
41
+ segments:
42
+ - 0
43
+ version: "0"
44
+ type: :development
45
+ version_requirements: *id002
22
46
  description:
23
47
  email: joseph@openhood.com
24
48
  executables: []
@@ -33,7 +57,6 @@ files:
33
57
  - LICENSE
34
58
  - README.rdoc
35
59
  - Rakefile
36
- - VERSION
37
60
  - lib/string_cleaner.rb
38
61
  - spec/spec_helper.rb
39
62
  - spec/string_cleaner_spec.rb
@@ -52,7 +75,6 @@ required_ruby_version: !ruby/object:Gem::Requirement
52
75
  requirements:
53
76
  - - ">="
54
77
  - !ruby/object:Gem::Version
55
- hash: 3
56
78
  segments:
57
79
  - 0
58
80
  version: "0"
@@ -61,7 +83,6 @@ required_rubygems_version: !ruby/object:Gem::Requirement
61
83
  requirements:
62
84
  - - ">="
63
85
  - !ruby/object:Gem::Version
64
- hash: 3
65
86
  segments:
66
87
  - 0
67
88
  version: "0"
data/VERSION DELETED
@@ -1 +0,0 @@
1
- 0.1.0