data_kit 0.0.8 → 0.0.9
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/data_kit/converters/number.rb +5 -1
- data/lib/data_kit/csv/parser.rb +0 -1
- data/lib/data_kit/csv/schema_analyzer.rb +0 -1
- data/lib/data_kit/version.rb +1 -1
- data/spec/converters/number_spec.rb +5 -0
- data/spec/csv/converter_spec.rb +15 -0
- data/spec/csv/schema_analyzer_spec.rb +0 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e62f0fdbbac2331ba30c312f2456e271d11ef544
|
4
|
+
data.tar.gz: 47f8462a6ee5e5d5e00e5a9d1d11b92b8ebc6506
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4403a287ef848eabea8074fcdc7739d28021c8efc86b314cbeb817e57e565f3a601049d10a72c89f21fb071b37b1a6ebedaa09c0919aa6ec27ccbeb90e88b7b5
|
7
|
+
data.tar.gz: c6f27fc161251157dbafb00b28913d3eeb39ec9cc8947233db21476e38afa737392daa1ca872b0288476668cbd6f82905dddc7e15f4199fe2a3bf48b0b74f2dd
|
data/Gemfile.lock
CHANGED
data/lib/data_kit/converters/number.rb
CHANGED
@@ -17,11 +17,15 @@ module DataKit
|
|
17
17
|
|
18
18
|
def reformat(value)
|
19
19
|
if value.is_a?(String)
|
20
|
-
value.gsub(/(\p{Sc}|\,)/, '')
|
20
|
+
value.encode('UTF-8', encoding_opts).gsub(/(\p{Sc}|\,)/, '')
|
21
21
|
else
|
22
22
|
value
|
23
23
|
end
|
24
24
|
end
|
25
|
+
|
26
|
+
def encoding_opts
|
27
|
+
{:invalid => :replace, :undef => :replace, :replace => '?'}
|
28
|
+
end
|
25
29
|
end
|
26
30
|
end
|
27
31
|
end
|
data/lib/data_kit/csv/parser.rb
CHANGED
data/lib/data_kit/csv/schema_analyzer.rb
CHANGED
data/lib/data_kit/version.rb
CHANGED
data/spec/converters/number_spec.rb
CHANGED
@@ -17,4 +17,9 @@ describe DataKit::Converters::Integer do
|
|
17
17
|
DataKit::Converters::Number.convert(reformatted).should == result
|
18
18
|
end
|
19
19
|
end
|
20
|
+
|
21
|
+
it "should reformat strings with unknown encodings" do
|
22
|
+
str = "9350 Waxie WayÊSuite"
|
23
|
+
DataKit::Converters::Number.reformat(str).should == str
|
24
|
+
end
|
20
25
|
end
|
data/spec/csv/converter_spec.rb
CHANGED
@@ -17,6 +17,10 @@ describe DataKit::CSV::Converter do
|
|
17
17
|
DataKit::CSV::Parser.new(data_path('utf8.csv'))
|
18
18
|
}
|
19
19
|
|
20
|
+
let(:asciicsv) {
|
21
|
+
DataKit::CSV::Parser.new(data_path('vc_backed_companies.csv'))
|
22
|
+
}
|
23
|
+
|
20
24
|
it "should initialize and execute" do
|
21
25
|
analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
|
22
26
|
converter = DataKit::CSV::Converter.new(csv, analysis, target)
|
@@ -47,4 +51,15 @@ describe DataKit::CSV::Converter do
|
|
47
51
|
CSV.open(target).each { |row| row_count += 1 }
|
48
52
|
row_count.should == 11
|
49
53
|
end
|
54
|
+
|
55
|
+
it "should convert rows with invalid UTF-8 characters" do
|
56
|
+
analysis = DataKit::CSV::SchemaAnalyzer.analyze(asciicsv, :sampling_rate => 1)
|
57
|
+
converter = DataKit::CSV::Converter.new(csv, analysis, target)
|
58
|
+
|
59
|
+
converter.execute
|
60
|
+
|
61
|
+
row_count = 0
|
62
|
+
CSV.open(target).each { |row| row_count += 1 }
|
63
|
+
row_count.should == 11
|
64
|
+
end
|
50
65
|
end
|
data/spec/csv/schema_analyzer_spec.rb
CHANGED
@@ -52,9 +52,6 @@ describe DataKit::CSV::SchemaAnalyzer do
|
|
52
52
|
analysis.row_count.should == 10
|
53
53
|
analysis.sample_count.should be < 10
|
54
54
|
analysis.use_type_hints.should == true
|
55
|
-
|
56
|
-
puts analysis.type_hints.inspect
|
57
|
-
|
58
55
|
end
|
59
56
|
|
60
57
|
it "should execute an analysis without type hints" do
|