gigo 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (56) hide show
  1. data/.gitignore +17 -0
  2. data/Appraisals +17 -0
  3. data/Gemfile +3 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +50 -0
  6. data/Rakefile +18 -0
  7. data/gemfiles/activesupport30.gemfile +7 -0
  8. data/gemfiles/activesupport30.gemfile.lock +29 -0
  9. data/gemfiles/activesupport31.gemfile +7 -0
  10. data/gemfiles/activesupport31.gemfile.lock +31 -0
  11. data/gemfiles/activesupport32.gemfile +7 -0
  12. data/gemfiles/activesupport32.gemfile.lock +33 -0
  13. data/gemfiles/activesupport40.gemfile +7 -0
  14. data/gemfiles/activesupport40.gemfile.lock +45 -0
  15. data/gigo.gemspec +24 -0
  16. data/lib/gigo.rb +34 -0
  17. data/lib/gigo/rchardet.rb +67 -0
  18. data/lib/gigo/rchardet/big5freq.rb +927 -0
  19. data/lib/gigo/rchardet/big5prober.rb +43 -0
  20. data/lib/gigo/rchardet/chardistribution.rb +238 -0
  21. data/lib/gigo/rchardet/charsetgroupprober.rb +113 -0
  22. data/lib/gigo/rchardet/charsetprober.rb +76 -0
  23. data/lib/gigo/rchardet/codingstatemachine.rb +66 -0
  24. data/lib/gigo/rchardet/constants.rb +43 -0
  25. data/lib/gigo/rchardet/escprober.rb +90 -0
  26. data/lib/gigo/rchardet/escsm.rb +245 -0
  27. data/lib/gigo/rchardet/eucjpprober.rb +89 -0
  28. data/lib/gigo/rchardet/euckrfreq.rb +598 -0
  29. data/lib/gigo/rchardet/euckrprober.rb +43 -0
  30. data/lib/gigo/rchardet/euctwfreq.rb +431 -0
  31. data/lib/gigo/rchardet/euctwprober.rb +43 -0
  32. data/lib/gigo/rchardet/gb2312freq.rb +475 -0
  33. data/lib/gigo/rchardet/gb2312prober.rb +43 -0
  34. data/lib/gigo/rchardet/hebrewprober.rb +291 -0
  35. data/lib/gigo/rchardet/jisfreq.rb +571 -0
  36. data/lib/gigo/rchardet/jpcntx.rb +230 -0
  37. data/lib/gigo/rchardet/langbulgarianmodel.rb +230 -0
  38. data/lib/gigo/rchardet/langcyrillicmodel.rb +331 -0
  39. data/lib/gigo/rchardet/langgreekmodel.rb +228 -0
  40. data/lib/gigo/rchardet/langhebrewmodel.rb +203 -0
  41. data/lib/gigo/rchardet/langhungarianmodel.rb +227 -0
  42. data/lib/gigo/rchardet/langthaimodel.rb +202 -0
  43. data/lib/gigo/rchardet/latin1prober.rb +148 -0
  44. data/lib/gigo/rchardet/mbcharsetprober.rb +91 -0
  45. data/lib/gigo/rchardet/mbcsgroupprober.rb +48 -0
  46. data/lib/gigo/rchardet/mbcssm.rb +543 -0
  47. data/lib/gigo/rchardet/sbcharsetprober.rb +125 -0
  48. data/lib/gigo/rchardet/sbcsgroupprober.rb +59 -0
  49. data/lib/gigo/rchardet/sjisprober.rb +89 -0
  50. data/lib/gigo/rchardet/universaldetector.rb +169 -0
  51. data/lib/gigo/rchardet/utf8prober.rb +87 -0
  52. data/lib/gigo/version.rb +3 -0
  53. data/test/cases/gigo_test.rb +58 -0
  54. data/test/support/minitest.rb +7 -0
  55. data/test/test_helper.rb +14 -0
  56. metadata +207 -0
@@ -0,0 +1,3 @@
1
+ module GIGO
2
+ VERSION = "1.0.0"
3
+ end
@@ -0,0 +1,58 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
+ module GIGO
5
+ class BaseTest < TestCase
6
+
7
+ include ERB::Util
8
+
9
+ let(:data_utf8_emoji) { "💖" }
10
+ let(:data_utf8) { "€20 – “Woohoo”" }
11
+ let(:data_bad_readin) { "�20 � �Woohoo�" }
12
+ let(:data_cp1252) { data_utf8.encode('CP1252') }
13
+ let(:data_really_bad) { "ed.Ã\u0083Ã\u0083\xC3" }
14
+
15
+
16
+ describe '.load' do
17
+
18
+ it 'ignores if string is not present' do
19
+ GIGO.load('').must_equal ''
20
+ GIGO.load(nil).must_be_nil
21
+ o = Object.new
22
+ GIGO.load(o).must_equal o
23
+ end
24
+
25
+ it 'should allow properly encoded and marked strings to be passed thru' do
26
+ GIGO.load(data_utf8).must_equal data_utf8
27
+ GIGO.load(data_utf8_emoji).must_equal data_utf8_emoji
28
+ end
29
+
30
+ it 'allows data already read in with question marks to pass thru' do
31
+ GIGO.load(data_bad_readin).must_equal data_bad_readin
32
+ end
33
+
34
+ it 'allows really bad data to be encoded using default replace and question marks' do
35
+ GIGO.load(data_utf8_emoji.force_encoding('ASCII-8BIT')).must_equal '����'
36
+ end
37
+
38
+ it 'converts windows codepages that are poorly marked as another encoding' do
39
+ db_data1 = data_cp1252.dup.force_encoding('ASCII-8BIT')
40
+ GIGO.load(db_data1).must_equal data_utf8
41
+ db_data2 = data_cp1252.dup.force_encoding('US-ASCII')
42
+ GIGO.load(db_data2).must_equal data_utf8
43
+ db_data3 = data_cp1252.dup.force_encoding('UTF-8')
44
+ GIGO.load(db_data3).must_equal data_utf8
45
+ db_data4 = data_cp1252.dup
46
+ GIGO.load(db_data4).must_equal data_utf8
47
+ end
48
+
49
+ it 'can make sure it is really a valid encoding afterward' do
50
+ html_escape GIGO.load(data_really_bad)
51
+ end
52
+
53
+ end
54
+
55
+
56
+
57
+ end
58
+ end
@@ -0,0 +1,7 @@
1
+ require 'minitest/emoji'
2
+
3
+ if ENV['CI']
4
+ MiniTest::Emoji::DEFAULT.merge! '.' => ".", 'F' => "F", 'E' => "E", 'S' => "S"
5
+ else
6
+ MiniTest::Emoji::DEFAULT.merge! '.' => "\u{1f49A} ", 'F' => "\u{1f494} ", 'E' => "\u{1f480} ", 'S' => "\u{1f49B} "
7
+ end
@@ -0,0 +1,14 @@
1
+ require 'bundler'
2
+ require 'minitest/autorun'
3
+ Bundler.require :development, :test
4
+ require 'gigo'
5
+ require 'support/minitest'
6
+ require 'erb'
7
+
8
+ module GIGO
9
+ class TestCase < MiniTest::Spec
10
+
11
+
12
+
13
+ end
14
+ end
metadata ADDED
@@ -0,0 +1,207 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gigo
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ken Collins
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-26 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: activesupport
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '3.0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '3.0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: ensure_valid_encoding
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 0.5.3
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 0.5.3
46
+ - !ruby/object:Gem::Dependency
47
+ name: appraisal
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rake
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: minitest
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: minitest-emoji
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Garbage in, garbage out. Fix ruby encoded strings at all costs.
111
+ email:
112
+ - kcollins@customink.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - .gitignore
118
+ - Appraisals
119
+ - Gemfile
120
+ - LICENSE.txt
121
+ - README.md
122
+ - Rakefile
123
+ - gemfiles/activesupport30.gemfile
124
+ - gemfiles/activesupport30.gemfile.lock
125
+ - gemfiles/activesupport31.gemfile
126
+ - gemfiles/activesupport31.gemfile.lock
127
+ - gemfiles/activesupport32.gemfile
128
+ - gemfiles/activesupport32.gemfile.lock
129
+ - gemfiles/activesupport40.gemfile
130
+ - gemfiles/activesupport40.gemfile.lock
131
+ - gigo.gemspec
132
+ - lib/gigo.rb
133
+ - lib/gigo/rchardet.rb
134
+ - lib/gigo/rchardet/big5freq.rb
135
+ - lib/gigo/rchardet/big5prober.rb
136
+ - lib/gigo/rchardet/chardistribution.rb
137
+ - lib/gigo/rchardet/charsetgroupprober.rb
138
+ - lib/gigo/rchardet/charsetprober.rb
139
+ - lib/gigo/rchardet/codingstatemachine.rb
140
+ - lib/gigo/rchardet/constants.rb
141
+ - lib/gigo/rchardet/escprober.rb
142
+ - lib/gigo/rchardet/escsm.rb
143
+ - lib/gigo/rchardet/eucjpprober.rb
144
+ - lib/gigo/rchardet/euckrfreq.rb
145
+ - lib/gigo/rchardet/euckrprober.rb
146
+ - lib/gigo/rchardet/euctwfreq.rb
147
+ - lib/gigo/rchardet/euctwprober.rb
148
+ - lib/gigo/rchardet/gb2312freq.rb
149
+ - lib/gigo/rchardet/gb2312prober.rb
150
+ - lib/gigo/rchardet/hebrewprober.rb
151
+ - lib/gigo/rchardet/jisfreq.rb
152
+ - lib/gigo/rchardet/jpcntx.rb
153
+ - lib/gigo/rchardet/langbulgarianmodel.rb
154
+ - lib/gigo/rchardet/langcyrillicmodel.rb
155
+ - lib/gigo/rchardet/langgreekmodel.rb
156
+ - lib/gigo/rchardet/langhebrewmodel.rb
157
+ - lib/gigo/rchardet/langhungarianmodel.rb
158
+ - lib/gigo/rchardet/langthaimodel.rb
159
+ - lib/gigo/rchardet/latin1prober.rb
160
+ - lib/gigo/rchardet/mbcharsetprober.rb
161
+ - lib/gigo/rchardet/mbcsgroupprober.rb
162
+ - lib/gigo/rchardet/mbcssm.rb
163
+ - lib/gigo/rchardet/sbcharsetprober.rb
164
+ - lib/gigo/rchardet/sbcsgroupprober.rb
165
+ - lib/gigo/rchardet/sjisprober.rb
166
+ - lib/gigo/rchardet/universaldetector.rb
167
+ - lib/gigo/rchardet/utf8prober.rb
168
+ - lib/gigo/version.rb
169
+ - test/cases/gigo_test.rb
170
+ - test/support/minitest.rb
171
+ - test/test_helper.rb
172
+ homepage: http://github.com/customink/gigo
173
+ licenses: []
174
+ post_install_message:
175
+ rdoc_options: []
176
+ require_paths:
177
+ - lib
178
+ required_ruby_version: !ruby/object:Gem::Requirement
179
+ none: false
180
+ requirements:
181
+ - - ! '>='
182
+ - !ruby/object:Gem::Version
183
+ version: '0'
184
+ segments:
185
+ - 0
186
+ hash: -1651701888858574901
187
+ required_rubygems_version: !ruby/object:Gem::Requirement
188
+ none: false
189
+ requirements:
190
+ - - ! '>='
191
+ - !ruby/object:Gem::Version
192
+ version: '0'
193
+ segments:
194
+ - 0
195
+ hash: -1651701888858574901
196
+ requirements: []
197
+ rubyforge_project:
198
+ rubygems_version: 1.8.25
199
+ signing_key:
200
+ specification_version: 3
201
+ summary: The gigo gem aims to solve bad data, likely from a legacy database. It is
202
+ an anti-pattern and you should really consider standardizing what encoding you both
203
+ put in and take out of your data stores.
204
+ test_files:
205
+ - test/cases/gigo_test.rb
206
+ - test/support/minitest.rb
207
+ - test/test_helper.rb