data_kit 0.0.8 → 0.0.9

This diff shows the changes between publicly available versions of this package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: a079b89a65b3f20db63058e070d6b12d19db436f
4
- data.tar.gz: 78868c7945412096389eca3d05e15a495d73f08b
3
+ metadata.gz: e62f0fdbbac2331ba30c312f2456e271d11ef544
4
+ data.tar.gz: 47f8462a6ee5e5d5e00e5a9d1d11b92b8ebc6506
5
5
  SHA512:
6
- metadata.gz: f1a80f34c222c89ee9af223796c06d50087dfddd9def99d4d1b6edd4c650ec6fd0f1dc8770dbf5c8f6c62433a8987e4f157e945ac9d0b188507a6e51feb67562
7
- data.tar.gz: 8f4c7dac994fe348c69eba8b3b2a62105c33f8f4565efdc2effd3a4f41b3582f34b0b8ed8d86b89fa9a1986e1f152147563967f43e32426d255c5fb910728ec5
6
+ metadata.gz: 4403a287ef848eabea8074fcdc7739d28021c8efc86b314cbeb817e57e565f3a601049d10a72c89f21fb071b37b1a6ebedaa09c0919aa6ec27ccbeb90e88b7b5
7
+ data.tar.gz: c6f27fc161251157dbafb00b28913d3eeb39ec9cc8947233db21476e38afa737392daa1ca872b0288476668cbd6f82905dddc7e15f4199fe2a3bf48b0b74f2dd
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- data_kit (0.0.8)
4
+ data_kit (0.0.9)
5
5
  rcsv
6
6
  timeliness
7
7
 
@@ -17,11 +17,15 @@ module DataKit
17
17
 
18
18
  def reformat(value)
19
19
  if value.is_a?(String)
20
- value.gsub(/(\p{Sc}|\,)/, '')
20
+ value.encode('UTF-8', encoding_opts).gsub(/(\p{Sc}|\,)/, '')
21
21
  else
22
22
  value
23
23
  end
24
24
  end
25
+
26
+ def encoding_opts
27
+ {:invalid => :replace, :undef => :replace, :replace => '?'}
28
+ end
25
29
  end
26
30
  end
27
31
  end
@@ -17,7 +17,6 @@ module DataKit
17
17
  def each_row(&block)
18
18
  handle.rewind
19
19
  Rcsv.parse(handle, :header => :skip, :columns => columns, :row_as_hash => true) do |row|
20
- puts row.inspect
21
20
  yield row
22
21
  end
23
22
  end
@@ -27,7 +27,6 @@ module DataKit
27
27
  if random.rand <= sampling_rate
28
28
  analysis.increment_sample
29
29
  row.keys.each do |field_name|
30
- row[field_name].force_encoding('UTF-8')
31
30
  analysis.insert(field_name.to_s, row[field_name])
32
31
  end
33
32
  end
@@ -1,3 +1,3 @@
1
1
  module DataKit
2
- VERSION = "0.0.8"
2
+ VERSION = "0.0.9"
3
3
  end
@@ -17,4 +17,9 @@ describe DataKit::Converters::Integer do
17
17
  DataKit::Converters::Number.convert(reformatted).should == result
18
18
  end
19
19
  end
20
+
21
+ it "should reformat strings with unknown encodings" do
22
+ str = "9350 Waxie WayÊSuite"
23
+ DataKit::Converters::Number.reformat(str).should == str
24
+ end
20
25
  end
@@ -17,6 +17,10 @@ describe DataKit::CSV::Converter do
17
17
  DataKit::CSV::Parser.new(data_path('utf8.csv'))
18
18
  }
19
19
 
20
+ let(:asciicsv) {
21
+ DataKit::CSV::Parser.new(data_path('vc_backed_companies.csv'))
22
+ }
23
+
20
24
  it "should initialize and execute" do
21
25
  analysis = DataKit::CSV::SchemaAnalyzer.analyze(csv, :sampling_rate => 1)
22
26
  converter = DataKit::CSV::Converter.new(csv, analysis, target)
@@ -47,4 +51,15 @@ describe DataKit::CSV::Converter do
47
51
  CSV.open(target).each { |row| row_count += 1 }
48
52
  row_count.should == 11
49
53
  end
54
+
55
+ it "should convert rows with invalid UTF-8 characters" do
56
+ analysis = DataKit::CSV::SchemaAnalyzer.analyze(asciicsv, :sampling_rate => 1)
57
+ converter = DataKit::CSV::Converter.new(csv, analysis, target)
58
+
59
+ converter.execute
60
+
61
+ row_count = 0
62
+ CSV.open(target).each { |row| row_count += 1 }
63
+ row_count.should == 11
64
+ end
50
65
  end
@@ -52,9 +52,6 @@ describe DataKit::CSV::SchemaAnalyzer do
52
52
  analysis.row_count.should == 10
53
53
  analysis.sample_count.should be < 10
54
54
  analysis.use_type_hints.should == true
55
-
56
- puts analysis.type_hints.inspect
57
-
58
55
  end
59
56
 
60
57
  it "should execute an analysis without type hints" do
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.8
4
+ version: 0.0.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mode Analytics