data_kit 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/Gemfile.lock +1 -1
- data/lib/data_kit/converters/number.rb +5 -1
- data/lib/data_kit/csv/parser.rb +0 -1
- data/lib/data_kit/csv/schema_analyzer.rb +0 -1
- data/lib/data_kit/version.rb +1 -1
- data/spec/converters/number_spec.rb +5 -0
- data/spec/csv/converter_spec.rb +15 -0
- data/spec/csv/schema_analyzer_spec.rb +0 -3
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e62f0fdbbac2331ba30c312f2456e271d11ef544
|
4
|
+
data.tar.gz: 47f8462a6ee5e5d5e00e5a9d1d11b92b8ebc6506
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4403a287ef848eabea8074fcdc7739d28021c8efc86b314cbeb817e57e565f3a601049d10a72c89f21fb071b37b1a6ebedaa09c0919aa6ec27ccbeb90e88b7b5
|
7
|
+
data.tar.gz: c6f27fc161251157dbafb00b28913d3eeb39ec9cc8947233db21476e38afa737392daa1ca872b0288476668cbd6f82905dddc7e15f4199fe2a3bf48b0b74f2dd
|
data/Gemfile.lock
CHANGED
data/lib/data_kit/converters/number.rb
CHANGED
@@ -17,11 +17,15 @@ module DataKit
|
|
17
17
|
|
18
18
|
def reformat(value)
|
19
19
|
if value.is_a?(String)
|
20
|
-
value.gsub(/(\p{Sc}|\,)/, '')
|
20
|
+
value.encode('UTF-8', encoding_opts).gsub(/(\p{Sc}|\,)/, '')
|
21
21
|
else
|
22
22
|
value
|
23
23
|
end
|
24
24
|
end
|
25
|
+
|
26
|
+
def encoding_opts
|
27
|
+
{:invalid => :replace, :undef => :replace, :replace => '?'}
|
28
|
+
end
|
25
29
|
end
|
26
30
|
end
|
27
31
|
end
|
data/lib/data_kit/csv/parser.rb
CHANGED
data/lib/data_kit/version.rb
CHANGED
data/spec/converters/number_spec.rb
CHANGED
@@ -17,4 +17,9 @@ describe DataKit::Converters::Integer do
|
|
17
17
|
DataKit::Converters::Number.convert(reformatted).should == result
|
18
18
|
end
|
19
19
|
end
|
20
|
+
|
21
|
+
it "should reformat strings with unknown encodings" do
|
22
|
+
str = "9350 Waxie WayÊSuite"
|
23
|
+
DataKit::Converters::Number.reformat(str).should == str
|
24
|
+
end
|
20
25
|
end
|
data/spec/csv/converter_spec.rb
CHANGED
@@ -17,6 +17,10 @@ describe DataKit::CSV::Converter do
|
|
17
17
|
DataKit::CSV::Parser.new(data_path('utf8.csv'))
|
18
18
|
}
|
19
19
|
|
20
|
+
let(:asciicsv) {
|
21
|
+
DataKit::CSV::Parser.new(data_path('vc_backed_companies.csv'))
|
22
|
+
}
|
23
|
+
|
20
24
|
it "should initialize and execute" do
|
21
25
|
analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
|
22
26
|
converter = DataKit::CSV::Converter.new(csv, analysis, target)
|
@@ -47,4 +51,15 @@ describe DataKit::CSV::Converter do
|
|
47
51
|
CSV.open(target).each { |row| row_count += 1 }
|
48
52
|
row_count.should == 11
|
49
53
|
end
|
54
|
+
|
55
|
+
it "should convert rows with invalid UTF-8 characters" do
|
56
|
+
analysis = DataKit::CSV::SchemaAnalyzer.analyze(asciicsv, :sampling_rate => 1)
|
57
|
+
converter = DataKit::CSV::Converter.new(csv, analysis, target)
|
58
|
+
|
59
|
+
converter.execute
|
60
|
+
|
61
|
+
row_count = 0
|
62
|
+
CSV.open(target).each { |row| row_count += 1 }
|
63
|
+
row_count.should == 11
|
64
|
+
end
|
50
65
|
end
|
data/spec/csv/schema_analyzer_spec.rb
CHANGED
@@ -52,9 +52,6 @@ describe DataKit::CSV::SchemaAnalyzer do
|
|
52
52
|
analysis.row_count.should == 10
|
53
53
|
analysis.sample_count.should be < 10
|
54
54
|
analysis.use_type_hints.should == true
|
55
|
-
|
56
|
-
puts analysis.type_hints.inspect
|
57
|
-
|
58
55
|
end
|
59
56
|
|
60
57
|
it "should execute an analysis without type hints" do
|