gigo 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. data/.gitignore +17 -0
  2. data/Appraisals +17 -0
  3. data/Gemfile +3 -0
  4. data/LICENSE.txt +22 -0
  5. data/README.md +50 -0
  6. data/Rakefile +18 -0
  7. data/gemfiles/activesupport30.gemfile +7 -0
  8. data/gemfiles/activesupport30.gemfile.lock +29 -0
  9. data/gemfiles/activesupport31.gemfile +7 -0
  10. data/gemfiles/activesupport31.gemfile.lock +31 -0
  11. data/gemfiles/activesupport32.gemfile +7 -0
  12. data/gemfiles/activesupport32.gemfile.lock +33 -0
  13. data/gemfiles/activesupport40.gemfile +7 -0
  14. data/gemfiles/activesupport40.gemfile.lock +45 -0
  15. data/gigo.gemspec +24 -0
  16. data/lib/gigo.rb +34 -0
  17. data/lib/gigo/rchardet.rb +67 -0
  18. data/lib/gigo/rchardet/big5freq.rb +927 -0
  19. data/lib/gigo/rchardet/big5prober.rb +43 -0
  20. data/lib/gigo/rchardet/chardistribution.rb +238 -0
  21. data/lib/gigo/rchardet/charsetgroupprober.rb +113 -0
  22. data/lib/gigo/rchardet/charsetprober.rb +76 -0
  23. data/lib/gigo/rchardet/codingstatemachine.rb +66 -0
  24. data/lib/gigo/rchardet/constants.rb +43 -0
  25. data/lib/gigo/rchardet/escprober.rb +90 -0
  26. data/lib/gigo/rchardet/escsm.rb +245 -0
  27. data/lib/gigo/rchardet/eucjpprober.rb +89 -0
  28. data/lib/gigo/rchardet/euckrfreq.rb +598 -0
  29. data/lib/gigo/rchardet/euckrprober.rb +43 -0
  30. data/lib/gigo/rchardet/euctwfreq.rb +431 -0
  31. data/lib/gigo/rchardet/euctwprober.rb +43 -0
  32. data/lib/gigo/rchardet/gb2312freq.rb +475 -0
  33. data/lib/gigo/rchardet/gb2312prober.rb +43 -0
  34. data/lib/gigo/rchardet/hebrewprober.rb +291 -0
  35. data/lib/gigo/rchardet/jisfreq.rb +571 -0
  36. data/lib/gigo/rchardet/jpcntx.rb +230 -0
  37. data/lib/gigo/rchardet/langbulgarianmodel.rb +230 -0
  38. data/lib/gigo/rchardet/langcyrillicmodel.rb +331 -0
  39. data/lib/gigo/rchardet/langgreekmodel.rb +228 -0
  40. data/lib/gigo/rchardet/langhebrewmodel.rb +203 -0
  41. data/lib/gigo/rchardet/langhungarianmodel.rb +227 -0
  42. data/lib/gigo/rchardet/langthaimodel.rb +202 -0
  43. data/lib/gigo/rchardet/latin1prober.rb +148 -0
  44. data/lib/gigo/rchardet/mbcharsetprober.rb +91 -0
  45. data/lib/gigo/rchardet/mbcsgroupprober.rb +48 -0
  46. data/lib/gigo/rchardet/mbcssm.rb +543 -0
  47. data/lib/gigo/rchardet/sbcharsetprober.rb +125 -0
  48. data/lib/gigo/rchardet/sbcsgroupprober.rb +59 -0
  49. data/lib/gigo/rchardet/sjisprober.rb +89 -0
  50. data/lib/gigo/rchardet/universaldetector.rb +169 -0
  51. data/lib/gigo/rchardet/utf8prober.rb +87 -0
  52. data/lib/gigo/version.rb +3 -0
  53. data/test/cases/gigo_test.rb +58 -0
  54. data/test/support/minitest.rb +7 -0
  55. data/test/test_helper.rb +14 -0
  56. metadata +207 -0
@@ -0,0 +1,3 @@
1
+ module GIGO
2
+ VERSION = "1.0.0"
3
+ end
@@ -0,0 +1,58 @@
1
+ # -*- encoding: utf-8 -*-
2
+ require 'test_helper'
3
+
4
+ module GIGO
5
+ class BaseTest < TestCase
6
+
7
+ include ERB::Util
8
+
9
+ let(:data_utf8_emoji) { "💖" }
10
+ let(:data_utf8) { "€20 – “Woohoo”" }
11
+ let(:data_bad_readin) { "�20 � �Woohoo�" }
12
+ let(:data_cp1252) { data_utf8.encode('CP1252') }
13
+ let(:data_really_bad) { "ed.Ã\u0083Ã\u0083\xC3" }
14
+
15
+
16
+ describe '.load' do
17
+
18
+ it 'ignores if string is not present' do
19
+ GIGO.load('').must_equal ''
20
+ GIGO.load(nil).must_be_nil
21
+ o = Object.new
22
+ GIGO.load(o).must_equal o
23
+ end
24
+
25
+ it 'should allows properly encoded and marked strings to be passed thru' do
26
+ GIGO.load(data_utf8).must_equal data_utf8
27
+ GIGO.load(data_utf8_emoji).must_equal data_utf8_emoji
28
+ end
29
+
30
+ it 'allows data already read in with question marks to pass thru' do
31
+ GIGO.load(data_bad_readin).must_equal data_bad_readin
32
+ end
33
+
34
+ it 'allows really bad data to be encoded using default replace and question marks' do
35
+ GIGO.load(data_utf8_emoji.force_encoding('ASCII-8BIT')).must_equal '����'
36
+ end
37
+
38
+ it 'converts windows codepages that are poorly marked as another encoding' do
39
+ db_data1 = data_cp1252.dup.force_encoding('ASCII-8BIT')
40
+ GIGO.load(db_data1).must_equal data_utf8
41
+ db_data2 = data_cp1252.dup.force_encoding('US-ASCII')
42
+ GIGO.load(db_data2).must_equal data_utf8
43
+ db_data3 = data_cp1252.dup.force_encoding('UTF-8')
44
+ GIGO.load(db_data3).must_equal data_utf8
45
+ db_data4 = data_cp1252.dup
46
+ GIGO.load(db_data4).must_equal data_utf8
47
+ end
48
+
49
+ it 'can make sure to it is really a valid encoding afteward' do
50
+ html_escape GIGO.load(data_really_bad)
51
+ end
52
+
53
+ end
54
+
55
+
56
+
57
+ end
58
+ end
@@ -0,0 +1,7 @@
1
+ require 'minitest/emoji'
2
+
3
+ if ENV['CI']
4
+ MiniTest::Emoji::DEFAULT.merge! '.' => ".", 'F' => "F", 'E' => "E", 'S' => "S"
5
+ else
6
+ MiniTest::Emoji::DEFAULT.merge! '.' => "\u{1f49A} ", 'F' => "\u{1f494} ", 'E' => "\u{1f480} ", 'S' => "\u{1f49B} "
7
+ end
@@ -0,0 +1,14 @@
1
+ require 'bundler'
2
+ require 'minitest/autorun'
3
+ Bundler.require :development, :test
4
+ require 'gigo'
5
+ require 'support/minitest'
6
+ require 'erb'
7
+
8
+ module GIGO
9
+ class TestCase < MiniTest::Spec
10
+
11
+
12
+
13
+ end
14
+ end
metadata ADDED
@@ -0,0 +1,207 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: gigo
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Ken Collins
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2013-03-26 00:00:00.000000000 Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: activesupport
16
+ requirement: !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ! '>='
20
+ - !ruby/object:Gem::Version
21
+ version: '3.0'
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: !ruby/object:Gem::Requirement
25
+ none: false
26
+ requirements:
27
+ - - ! '>='
28
+ - !ruby/object:Gem::Version
29
+ version: '3.0'
30
+ - !ruby/object:Gem::Dependency
31
+ name: ensure_valid_encoding
32
+ requirement: !ruby/object:Gem::Requirement
33
+ none: false
34
+ requirements:
35
+ - - ~>
36
+ - !ruby/object:Gem::Version
37
+ version: 0.5.3
38
+ type: :runtime
39
+ prerelease: false
40
+ version_requirements: !ruby/object:Gem::Requirement
41
+ none: false
42
+ requirements:
43
+ - - ~>
44
+ - !ruby/object:Gem::Version
45
+ version: 0.5.3
46
+ - !ruby/object:Gem::Dependency
47
+ name: appraisal
48
+ requirement: !ruby/object:Gem::Requirement
49
+ none: false
50
+ requirements:
51
+ - - ! '>='
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ type: :development
55
+ prerelease: false
56
+ version_requirements: !ruby/object:Gem::Requirement
57
+ none: false
58
+ requirements:
59
+ - - ! '>='
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ - !ruby/object:Gem::Dependency
63
+ name: rake
64
+ requirement: !ruby/object:Gem::Requirement
65
+ none: false
66
+ requirements:
67
+ - - ! '>='
68
+ - !ruby/object:Gem::Version
69
+ version: '0'
70
+ type: :development
71
+ prerelease: false
72
+ version_requirements: !ruby/object:Gem::Requirement
73
+ none: false
74
+ requirements:
75
+ - - ! '>='
76
+ - !ruby/object:Gem::Version
77
+ version: '0'
78
+ - !ruby/object:Gem::Dependency
79
+ name: minitest
80
+ requirement: !ruby/object:Gem::Requirement
81
+ none: false
82
+ requirements:
83
+ - - ! '>='
84
+ - !ruby/object:Gem::Version
85
+ version: '0'
86
+ type: :development
87
+ prerelease: false
88
+ version_requirements: !ruby/object:Gem::Requirement
89
+ none: false
90
+ requirements:
91
+ - - ! '>='
92
+ - !ruby/object:Gem::Version
93
+ version: '0'
94
+ - !ruby/object:Gem::Dependency
95
+ name: minitest-emoji
96
+ requirement: !ruby/object:Gem::Requirement
97
+ none: false
98
+ requirements:
99
+ - - ! '>='
100
+ - !ruby/object:Gem::Version
101
+ version: '0'
102
+ type: :development
103
+ prerelease: false
104
+ version_requirements: !ruby/object:Gem::Requirement
105
+ none: false
106
+ requirements:
107
+ - - ! '>='
108
+ - !ruby/object:Gem::Version
109
+ version: '0'
110
+ description: Garbage in, garbage out. Fix ruby encoded strings at all costs.
111
+ email:
112
+ - kcollins@customink.com
113
+ executables: []
114
+ extensions: []
115
+ extra_rdoc_files: []
116
+ files:
117
+ - .gitignore
118
+ - Appraisals
119
+ - Gemfile
120
+ - LICENSE.txt
121
+ - README.md
122
+ - Rakefile
123
+ - gemfiles/activesupport30.gemfile
124
+ - gemfiles/activesupport30.gemfile.lock
125
+ - gemfiles/activesupport31.gemfile
126
+ - gemfiles/activesupport31.gemfile.lock
127
+ - gemfiles/activesupport32.gemfile
128
+ - gemfiles/activesupport32.gemfile.lock
129
+ - gemfiles/activesupport40.gemfile
130
+ - gemfiles/activesupport40.gemfile.lock
131
+ - gigo.gemspec
132
+ - lib/gigo.rb
133
+ - lib/gigo/rchardet.rb
134
+ - lib/gigo/rchardet/big5freq.rb
135
+ - lib/gigo/rchardet/big5prober.rb
136
+ - lib/gigo/rchardet/chardistribution.rb
137
+ - lib/gigo/rchardet/charsetgroupprober.rb
138
+ - lib/gigo/rchardet/charsetprober.rb
139
+ - lib/gigo/rchardet/codingstatemachine.rb
140
+ - lib/gigo/rchardet/constants.rb
141
+ - lib/gigo/rchardet/escprober.rb
142
+ - lib/gigo/rchardet/escsm.rb
143
+ - lib/gigo/rchardet/eucjpprober.rb
144
+ - lib/gigo/rchardet/euckrfreq.rb
145
+ - lib/gigo/rchardet/euckrprober.rb
146
+ - lib/gigo/rchardet/euctwfreq.rb
147
+ - lib/gigo/rchardet/euctwprober.rb
148
+ - lib/gigo/rchardet/gb2312freq.rb
149
+ - lib/gigo/rchardet/gb2312prober.rb
150
+ - lib/gigo/rchardet/hebrewprober.rb
151
+ - lib/gigo/rchardet/jisfreq.rb
152
+ - lib/gigo/rchardet/jpcntx.rb
153
+ - lib/gigo/rchardet/langbulgarianmodel.rb
154
+ - lib/gigo/rchardet/langcyrillicmodel.rb
155
+ - lib/gigo/rchardet/langgreekmodel.rb
156
+ - lib/gigo/rchardet/langhebrewmodel.rb
157
+ - lib/gigo/rchardet/langhungarianmodel.rb
158
+ - lib/gigo/rchardet/langthaimodel.rb
159
+ - lib/gigo/rchardet/latin1prober.rb
160
+ - lib/gigo/rchardet/mbcharsetprober.rb
161
+ - lib/gigo/rchardet/mbcsgroupprober.rb
162
+ - lib/gigo/rchardet/mbcssm.rb
163
+ - lib/gigo/rchardet/sbcharsetprober.rb
164
+ - lib/gigo/rchardet/sbcsgroupprober.rb
165
+ - lib/gigo/rchardet/sjisprober.rb
166
+ - lib/gigo/rchardet/universaldetector.rb
167
+ - lib/gigo/rchardet/utf8prober.rb
168
+ - lib/gigo/version.rb
169
+ - test/cases/gigo_test.rb
170
+ - test/support/minitest.rb
171
+ - test/test_helper.rb
172
+ homepage: http://github.com/customink/gigo
173
+ licenses: []
174
+ post_install_message:
175
+ rdoc_options: []
176
+ require_paths:
177
+ - lib
178
+ required_ruby_version: !ruby/object:Gem::Requirement
179
+ none: false
180
+ requirements:
181
+ - - ! '>='
182
+ - !ruby/object:Gem::Version
183
+ version: '0'
184
+ segments:
185
+ - 0
186
+ hash: -1651701888858574901
187
+ required_rubygems_version: !ruby/object:Gem::Requirement
188
+ none: false
189
+ requirements:
190
+ - - ! '>='
191
+ - !ruby/object:Gem::Version
192
+ version: '0'
193
+ segments:
194
+ - 0
195
+ hash: -1651701888858574901
196
+ requirements: []
197
+ rubyforge_project:
198
+ rubygems_version: 1.8.25
199
+ signing_key:
200
+ specification_version: 3
201
+ summary: The gigo gem aims to solve bad data, likely from a legacy database. It is
202
+ an anti-pattern and you should really consider standardizing what encoding you both
203
+ put in and take out of your data stores.
204
+ test_files:
205
+ - test/cases/gigo_test.rb
206
+ - test/support/minitest.rb
207
+ - test/test_helper.rb