gigo 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. data/.gitignore +17 -0
  2. data/Appraisals +17 -0
  3. data/Gemfile +3 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +50 -0
  6. data/Rakefile +18 -0
  7. data/gemfiles/activesupport30.gemfile +7 -0
  8. data/gemfiles/activesupport30.gemfile.lock +29 -0
  9. data/gemfiles/activesupport31.gemfile +7 -0
  10. data/gemfiles/activesupport31.gemfile.lock +31 -0
  11. data/gemfiles/activesupport32.gemfile +7 -0
  12. data/gemfiles/activesupport32.gemfile.lock +33 -0
  13. data/gemfiles/activesupport40.gemfile +7 -0
  14. data/gemfiles/activesupport40.gemfile.lock +45 -0
  15. data/gigo.gemspec +24 -0
  16. data/lib/gigo.rb +34 -0
  17. data/lib/gigo/rchardet.rb +67 -0
  18. data/lib/gigo/rchardet/big5freq.rb +927 -0
  19. data/lib/gigo/rchardet/big5prober.rb +43 -0
  20. data/lib/gigo/rchardet/chardistribution.rb +238 -0
  21. data/lib/gigo/rchardet/charsetgroupprober.rb +113 -0
  22. data/lib/gigo/rchardet/charsetprober.rb +76 -0
  23. data/lib/gigo/rchardet/codingstatemachine.rb +66 -0
  24. data/lib/gigo/rchardet/constants.rb +43 -0
  25. data/lib/gigo/rchardet/escprober.rb +90 -0
  26. data/lib/gigo/rchardet/escsm.rb +245 -0
  27. data/lib/gigo/rchardet/eucjpprober.rb +89 -0
  28. data/lib/gigo/rchardet/euckrfreq.rb +598 -0
  29. data/lib/gigo/rchardet/euckrprober.rb +43 -0
  30. data/lib/gigo/rchardet/euctwfreq.rb +431 -0
  31. data/lib/gigo/rchardet/euctwprober.rb +43 -0
  32. data/lib/gigo/rchardet/gb2312freq.rb +475 -0
  33. data/lib/gigo/rchardet/gb2312prober.rb +43 -0
  34. data/lib/gigo/rchardet/hebrewprober.rb +291 -0
  35. data/lib/gigo/rchardet/jisfreq.rb +571 -0
  36. data/lib/gigo/rchardet/jpcntx.rb +230 -0
  37. data/lib/gigo/rchardet/langbulgarianmodel.rb +230 -0
  38. data/lib/gigo/rchardet/langcyrillicmodel.rb +331 -0
  39. data/lib/gigo/rchardet/langgreekmodel.rb +228 -0
  40. data/lib/gigo/rchardet/langhebrewmodel.rb +203 -0
  41. data/lib/gigo/rchardet/langhungarianmodel.rb +227 -0
  42. data/lib/gigo/rchardet/langthaimodel.rb +202 -0
  43. data/lib/gigo/rchardet/latin1prober.rb +148 -0
  44. data/lib/gigo/rchardet/mbcharsetprober.rb +91 -0
  45. data/lib/gigo/rchardet/mbcsgroupprober.rb +48 -0
  46. data/lib/gigo/rchardet/mbcssm.rb +543 -0
  47. data/lib/gigo/rchardet/sbcharsetprober.rb +125 -0
  48. data/lib/gigo/rchardet/sbcsgroupprober.rb +59 -0
  49. data/lib/gigo/rchardet/sjisprober.rb +89 -0
  50. data/lib/gigo/rchardet/universaldetector.rb +169 -0
  51. data/lib/gigo/rchardet/utf8prober.rb +87 -0
  52. data/lib/gigo/version.rb +3 -0
  53. data/test/cases/gigo_test.rb +58 -0
  54. data/test/support/minitest.rb +7 -0
  55. data/test/test_helper.rb +14 -0
  56. metadata +207 -0
data/.gitignore ADDED
@@ -0,0 +1,17 @@
1
+ *.gem
2
+ *.rbc
3
+ .bundle
4
+ .config
5
+ .yardoc
6
+ Gemfile.lock
7
+ InstalledFiles
8
+ _yardoc
9
+ coverage
10
+ doc/
11
+ lib/bundler/man
12
+ pkg
13
+ rdoc
14
+ spec/reports
15
+ test/tmp
16
+ test/version_tmp
17
+ tmp
data/Appraisals ADDED
@@ -0,0 +1,17 @@
1
+
2
+ appraise 'activesupport30' do # each appraise block names one dependency set to run the test suite against
3
+ gem 'activesupport', '~> 3.0.0'
4
+ end
5
+
6
+ appraise 'activesupport31' do
7
+ gem 'activesupport', '~> 3.1.0'
8
+ end
9
+
10
+ appraise 'activesupport32' do
11
+ gem 'activesupport', '~> 3.2.0'
12
+ end
13
+
14
+ appraise 'activesupport40' do
15
+ gem 'activesupport', :github => 'rails/rails' # taken from the rails/rails GitHub repo rather than a released version, so this set is not version-pinned
16
+ end
17
+
data/Gemfile ADDED
@@ -0,0 +1,3 @@
1
+ source 'https://rubygems.org'
2
+
3
+ gemspec
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2013 Ken Collins
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,50 @@
1
+
2
+ # GIGO (Garbage In, Garbage Out)
3
+
4
+ Or better yet, Garbage In, Gold Out! - The GIGO gem aims to fix ruby string encodings at all costs!
5
+
6
+ The GIGO gem is likely not the proper solution. If you have bad encodings in your database, you should fix them and write consistent encodings. That said, if you have no other choice, GIGO can help.
7
+
8
+ This gem depends on one of the many public forks of `CharDet` for Ruby. Since `CharDet` is not published as a gem that follows proper semantic versioning, we have decided to vendor the [kirillrdy/rchardet](http://github.com/kirillrdy/rchardet) repo. We have even made sure that our vendored version stays in our namespace by using `GIGO::CharDet`. So if you have another version bundled, feel confident that the two will not conflict.
9
+
10
+ We use `GIGO::CharDet` to do the grunt work of finding the proper encoding of an untrusted string. Once found, we use the [EnsureValidEncoding](http://github.com/jrochkind/ensure_valid_encoding) gem to force an encoding while replacing any non-convertible characters.
11
+
12
+
13
+ ## Usage
14
+
15
+ Simple, just pass a string to `GIGO.load`. Nil values and properly encoded strings are returned as is. Otherwise, `GIGO` will do its best to convert the string to your default internal (or UTF-8) encoding.
16
+
17
+ ```ruby
18
+ GIGO.load "€20 – “Woohoo”"
19
+ ```
20
+
21
+ Let's say you have a `comments` column on an ActiveRecord model which is not guaranteed to come back per your default external encoding.
22
+
23
+ ```ruby
24
+ def comments
25
+ GIGO.load read_attribute(:comments)
26
+ end
27
+ ```
28
+
29
+
30
+ ## Toe Dough List
31
+
32
+ Remove CharDet and look at something like [CharlockHolmes](https://github.com/brianmario/charlock_holmes). I had install problems with this and it also failed a few initial tire kicks. See [my notes](https://gist.github.com/metaskills/5029604) here on the topic.
33
+
34
+
35
+ ## Contributing
36
+
37
+ GIGO is fully tested with ActiveSupport 3.0 to 4 and upward. If you detect a problem, open up a github issue or fork the repo and help out. After you fork or clone the repository, the following commands will get you up and running on the test suite.
38
+
39
+ ```shell
40
+ $ bundle install
41
+ $ bundle exec rake appraisal:setup
42
+ $ bundle exec rake appraisal test
43
+ ```
44
+
45
+ We use the [appraisal](https://github.com/thoughtbot/appraisal) gem from Thoughtbot to help us generate the individual gemfiles for each ActiveSupport version and to run the tests locally against each generated Gemfile. The `rake appraisal test` command actually runs our test suite against all ActiveSupport versions in our `Appraisals` file. If you want to run the tests for a specific ActiveSupport version, use `rake -T` for a list. For example, the following command will run the tests for ActiveSupport 3.2 only.
46
+
47
+ ```shell
48
+ $ bundle exec rake appraisal:activesupport32 test
49
+ ```
50
+
data/Rakefile ADDED
@@ -0,0 +1,18 @@
1
+ require 'bundler/gem_tasks'
2
+ require 'rake/testtask'
3
+ require 'appraisal'
4
+
5
+ Rake::TestTask.new do |t| # defines the test task that the default task below depends on
6
+ t.libs = ['lib','test']
7
+ t.test_files = Dir.glob('test/**/*_test.rb').sort
8
+ t.verbose = true
9
+ end
10
+
11
+ task :default => :test
12
+
13
+ desc "Setup Appraisal."
14
+ task 'appraisal:setup' do # invokes the three appraisal tasks below in order: cleanup, gemfiles, install
15
+ Rake::Task['appraisal:cleanup'].invoke
16
+ Rake::Task['appraisal:gemfiles'].invoke
17
+ Rake::Task['appraisal:install'].invoke
18
+ end
@@ -0,0 +1,7 @@
1
+ # This file was generated by Appraisal
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gem "activesupport", "~> 3.0.0"
6
+
7
+ gemspec :path=>"../"
@@ -0,0 +1,29 @@
1
+ PATH
2
+ remote: /Users/kencollins/Repositories/gigo
3
+ specs:
4
+ gigo (1.0.0)
5
+ activesupport (>= 3.0)
6
+ ensure_valid_encoding (~> 0.5.3)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ activesupport (3.0.20)
12
+ appraisal (0.5.1)
13
+ bundler
14
+ rake
15
+ ensure_valid_encoding (0.5.3)
16
+ minitest (4.6.1)
17
+ minitest-emoji (1.0.0)
18
+ rake (10.0.3)
19
+
20
+ PLATFORMS
21
+ ruby
22
+
23
+ DEPENDENCIES
24
+ activesupport (~> 3.0.0)
25
+ appraisal
26
+ gigo!
27
+ minitest
28
+ minitest-emoji
29
+ rake
@@ -0,0 +1,7 @@
1
+ # This file was generated by Appraisal
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gem "activesupport", "~> 3.1.0"
6
+
7
+ gemspec :path=>"../"
@@ -0,0 +1,31 @@
1
+ PATH
2
+ remote: /Users/kencollins/Repositories/gigo
3
+ specs:
4
+ gigo (1.0.0)
5
+ activesupport (>= 3.0)
6
+ ensure_valid_encoding (~> 0.5.3)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ activesupport (3.1.10)
12
+ multi_json (>= 1.0, < 1.3)
13
+ appraisal (0.5.1)
14
+ bundler
15
+ rake
16
+ ensure_valid_encoding (0.5.3)
17
+ minitest (4.6.1)
18
+ minitest-emoji (1.0.0)
19
+ multi_json (1.2.0)
20
+ rake (10.0.3)
21
+
22
+ PLATFORMS
23
+ ruby
24
+
25
+ DEPENDENCIES
26
+ activesupport (~> 3.1.0)
27
+ appraisal
28
+ gigo!
29
+ minitest
30
+ minitest-emoji
31
+ rake
@@ -0,0 +1,7 @@
1
+ # This file was generated by Appraisal
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gem "activesupport", "~> 3.2.0"
6
+
7
+ gemspec :path=>"../"
@@ -0,0 +1,33 @@
1
+ PATH
2
+ remote: /Users/kencollins/Repositories/gigo
3
+ specs:
4
+ gigo (1.0.0)
5
+ activesupport (>= 3.0)
6
+ ensure_valid_encoding (~> 0.5.3)
7
+
8
+ GEM
9
+ remote: https://rubygems.org/
10
+ specs:
11
+ activesupport (3.2.12)
12
+ i18n (~> 0.6)
13
+ multi_json (~> 1.0)
14
+ appraisal (0.5.1)
15
+ bundler
16
+ rake
17
+ ensure_valid_encoding (0.5.3)
18
+ i18n (0.6.1)
19
+ minitest (4.6.1)
20
+ minitest-emoji (1.0.0)
21
+ multi_json (1.6.1)
22
+ rake (10.0.3)
23
+
24
+ PLATFORMS
25
+ ruby
26
+
27
+ DEPENDENCIES
28
+ activesupport (~> 3.2.0)
29
+ appraisal
30
+ gigo!
31
+ minitest
32
+ minitest-emoji
33
+ rake
@@ -0,0 +1,7 @@
1
+ # This file was generated by Appraisal
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gem "activesupport", :github=>"rails/rails"
6
+
7
+ gemspec :path=>"../"
@@ -0,0 +1,45 @@
1
+ GIT
2
+ remote: git://github.com/rails/rails.git
3
+ revision: 23048551fdb679e694d8245e9f8c969ed7a71f40
4
+ specs:
5
+ activesupport (4.0.0.beta)
6
+ i18n (~> 0.6)
7
+ minitest (~> 4.2)
8
+ multi_json (~> 1.3)
9
+ thread_safe (~> 0.1)
10
+ tzinfo (~> 0.3.33)
11
+
12
+ PATH
13
+ remote: /Users/kencollins/Repositories/gigo
14
+ specs:
15
+ gigo (1.0.0)
16
+ activesupport (>= 3.0)
17
+ ensure_valid_encoding (~> 0.5.3)
18
+
19
+ GEM
20
+ remote: https://rubygems.org/
21
+ specs:
22
+ appraisal (0.5.1)
23
+ bundler
24
+ rake
25
+ atomic (1.0.1)
26
+ ensure_valid_encoding (0.5.3)
27
+ i18n (0.6.1)
28
+ minitest (4.6.1)
29
+ minitest-emoji (1.0.0)
30
+ multi_json (1.6.1)
31
+ rake (10.0.3)
32
+ thread_safe (0.1.0)
33
+ atomic
34
+ tzinfo (0.3.35)
35
+
36
+ PLATFORMS
37
+ ruby
38
+
39
+ DEPENDENCIES
40
+ activesupport!
41
+ appraisal
42
+ gigo!
43
+ minitest
44
+ minitest-emoji
45
+ rake
data/gigo.gemspec ADDED
@@ -0,0 +1,24 @@
1
+ # -*- encoding: utf-8 -*-
2
+ lib = File.expand_path('../lib', __FILE__)
3
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
4
+ require 'gigo/version'
5
+
6
+ Gem::Specification.new do |gem| # gem metadata, packaged file list and dependencies for gigo
7
+ gem.name = 'gigo'
8
+ gem.version = GIGO::VERSION
9
+ gem.authors = ["Ken Collins"]
10
+ gem.email = ["kcollins@customink.com"]
11
+ gem.description = 'Garbage in, garbage out. Fix ruby encoded strings at all costs.' # NOTE(review): this is shorter than the summary below; the two look swapped - confirm
12
+ gem.summary = 'The gigo gem aims to solve bad data, likely from a legacy database. It is an anti-pattern and you should really consider standardizing what encoding you both put in and take out of your data stores.'
13
+ gem.homepage = 'http://github.com/customink/gigo'
14
+ gem.files = `git ls-files`.split($/) # file list is read from git, so the spec must be evaluated inside a git checkout
15
+ gem.executables = gem.files.grep(%r{^bin/}).map{ |f| File.basename(f) }
16
+ gem.test_files = gem.files.grep(%r{^(test|spec|features)/})
17
+ gem.require_paths = ["lib"]
18
+ gem.add_runtime_dependency 'activesupport', '>= 3.0' # lib/gigo.rb requires two active_support core extensions
19
+ gem.add_runtime_dependency 'ensure_valid_encoding', '~> 0.5.3'
20
+ gem.add_development_dependency 'appraisal'
21
+ gem.add_development_dependency 'rake'
22
+ gem.add_development_dependency 'minitest'
23
+ gem.add_development_dependency 'minitest-emoji'
24
+ end
data/lib/gigo.rb ADDED
@@ -0,0 +1,34 @@
1
+ require 'active_support/core_ext/object/acts_like'
2
+ require 'active_support/core_ext/string/behavior'
3
+ require 'ensure_valid_encoding'
4
+ require 'gigo/rchardet'
5
+ require 'gigo/version'
6
+
7
+ module GIGO
8
+
9
+ def self.load(data)
10
+ return data if data.nil? || !data.acts_like?(:string)
11
+ encoded_string = safe_detect_and_encoder(data)
12
+ return data if data.encoding == forced_encoding && data == encoded_string
13
+ encoded_string
14
+ end
15
+
16
+
17
+ protected
18
+
19
+ def self.safe_detect_and_encoder(data)
20
+ string = data
21
+ begin
22
+ encoding = CharDet.detect(string.dup)['encoding'] || string.encoding || Encoding.default_internal || forced_encoding
23
+ string = string.force_encoding(encoding).encode forced_encoding, :undef => :replace, :invalid => :replace
24
+ rescue Exception => e
25
+ string = string.encode forced_encoding, :undef => :replace, :invalid => :replace
26
+ end
27
+ EnsureValidEncoding.ensure_valid_encoding string, invalid: :replace, replace: "?"
28
+ end
29
+
30
+ def self.forced_encoding
31
+ Encoding.default_internal || Encoding::UTF_8
32
+ end
33
+
34
+ end
@@ -0,0 +1,67 @@
1
+ ######################## BEGIN LICENSE BLOCK ########################
2
+ # This library is free software; you can redistribute it and/or
3
+ # modify it under the terms of the GNU Lesser General Public
4
+ # License as published by the Free Software Foundation; either
5
+ # version 2.1 of the License, or (at your option) any later version.
6
+ #
7
+ # This library is distributed in the hope that it will be useful,
8
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
9
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10
+ # Lesser General Public License for more details.
11
+ #
12
+ # You should have received a copy of the GNU Lesser General Public
13
+ # License along with this library; if not, write to the Free Software
14
+ # Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
15
+ # 02110-1301 USA
16
+ ######################### END LICENSE BLOCK #########################
17
+
18
+ require 'gigo/rchardet/charsetprober'
19
+ require 'gigo/rchardet/mbcharsetprober'
20
+
21
+ require 'gigo/rchardet/big5freq'
22
+ require 'gigo/rchardet/big5prober'
23
+ require 'gigo/rchardet/chardistribution'
24
+ require 'gigo/rchardet/charsetgroupprober'
25
+
26
+ require 'gigo/rchardet/codingstatemachine'
27
+ require 'gigo/rchardet/constants'
28
+ require 'gigo/rchardet/escprober'
29
+ require 'gigo/rchardet/escsm'
30
+ require 'gigo/rchardet/eucjpprober'
31
+ require 'gigo/rchardet/euckrfreq'
32
+ require 'gigo/rchardet/euckrprober'
33
+ require 'gigo/rchardet/euctwfreq'
34
+ require 'gigo/rchardet/euctwprober'
35
+ require 'gigo/rchardet/gb2312freq'
36
+ require 'gigo/rchardet/gb2312prober'
37
+ require 'gigo/rchardet/hebrewprober'
38
+ require 'gigo/rchardet/jisfreq'
39
+ require 'gigo/rchardet/jpcntx'
40
+ require 'gigo/rchardet/langbulgarianmodel'
41
+ require 'gigo/rchardet/langcyrillicmodel'
42
+ require 'gigo/rchardet/langgreekmodel'
43
+ require 'gigo/rchardet/langhebrewmodel'
44
+ require 'gigo/rchardet/langhungarianmodel'
45
+ require 'gigo/rchardet/langthaimodel'
46
+ require 'gigo/rchardet/latin1prober'
47
+
48
+ require 'gigo/rchardet/mbcsgroupprober'
49
+ require 'gigo/rchardet/mbcssm'
50
+ require 'gigo/rchardet/sbcharsetprober'
51
+ require 'gigo/rchardet/sbcsgroupprober'
52
+ require 'gigo/rchardet/sjisprober'
53
+ require 'gigo/rchardet/universaldetector'
54
+ require 'gigo/rchardet/utf8prober'
55
+
56
+ module GIGO
57
+ module CharDet
58
+ VERSION = "1.3"
59
+ def CharDet.detect(aBuf)
60
+ u = UniversalDetector.new
61
+ u.reset
62
+ u.feed(aBuf)
63
+ u.close
64
+ u.result
65
+ end
66
+ end
67
+ end