data_kit 0.0.11 → 0.0.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a0057bb3688ec21f40b1c696675839e8408b98f
4
- data.tar.gz: 24429670b09afe543ff132f45bc44f14a5840d5e
3
+ metadata.gz: 30b22c22f9efa81ca1011684ed418acfd1fce0e1
4
+ data.tar.gz: 0875d12a8b029661945098eebe35965420a5dbb8
5
5
  SHA512:
6
- metadata.gz: 55c82ae032798b11107b6fed0bc69b1c83b2503c56dde2274c5b02f4f57f82391f92184548048f834bc3beb0658467cfedb1af436accdbd4ae1c66a9b2f79480
7
- data.tar.gz: 4b89e5f6031c28a4a3403e291e92ca278d470368ae437c740155e84f5e4acc11865204b5eb91d228b99855983c82b9eb663e29267853e092e3341d1897e55746
6
+ metadata.gz: 6520bb22fbce4620c012a1757b84582d68e577ccb2d8f730017f4afa9ac8190f9813a056582e220c012a5c0f0ec206fd4abc40af36810b23125758a2cfaa3180
7
+ data.tar.gz: 9518d32e467cce05341449ce4409ccad339e680f38bf8b65ad357d8f96a2311110d7eb3d5f2c826af56bbae5fed0baea16674837bbab8656be548926fab7a040
data/data_kit.gemspec CHANGED
@@ -19,7 +19,6 @@ Gem::Specification.new do |spec|
19
19
  spec.require_paths = ["lib"]
20
20
 
21
21
  # Runtime Dependencies
22
- spec.add_runtime_dependency "rcsv"
23
22
  spec.add_runtime_dependency "timeliness"
24
23
 
25
24
  # Development Dependencies
data/lib/data_kit.rb CHANGED
@@ -20,7 +20,4 @@ require 'data_kit/converters/date_time'
20
20
 
21
21
  # Datasets
22
22
  require 'data_kit/dataset/field'
23
- require 'data_kit/dataset/schema'
24
-
25
- # Patches / Fixes
26
- require 'data_kit/patches/rcsv'
23
+ require 'data_kit/dataset/schema'
@@ -16,11 +16,15 @@ module DataKit
16
16
 
17
17
  def execute
18
18
  ::CSV.open(output_path, 'wb') do |writer|
19
- writer << csv.headers
19
+ first = true
20
+ converted = []
20
21
  csv.each_row do |row|
21
- writer << csv.headers.collect do |field_name|
22
- convert(row[field_name], field_types[field_name])
22
+ if first
23
+ first = false
24
+ writer << csv.headers
23
25
  end
26
+
27
+ writer << convert_row(csv.headers, row)
24
28
  end
25
29
  end
26
30
  end
@@ -39,7 +43,15 @@ module DataKit
39
43
 
40
44
  private
41
45
 
42
- def convert(value, type)
46
+ def convert_row(headers, row)
47
+ converted = []
48
+ headers.each_with_index do |field_name, index|
49
+ converted << convert_value(row[index], field_types[field_name])
50
+ end
51
+ converted
52
+ end
53
+
54
+ def convert_value(value, type)
43
55
  if value.nil? || type == :string || type == :empty
44
56
  return value.to_s
45
57
  else
@@ -14,16 +14,21 @@ module DataKit
14
14
  end
15
15
 
16
16
  def execute
17
+ first = true
18
+ analysis = nil
17
19
  random = Random.new
18
20
 
19
- field_name = csv.headers[field_pos]
20
- analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
21
-
22
21
  csv.each_row do |row|
22
+ if first
23
+ first = false
24
+ field_name = csv.headers[field_pos]
25
+ analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
26
+ end
27
+
23
28
  analysis.increment_total
24
29
  if random.rand <= sampling_rate
25
30
  analysis.increment_sample
26
- analysis.insert(row[field_name])
31
+ analysis.insert(row[field_pos])
27
32
  end
28
33
  end
29
34
 
@@ -1,4 +1,4 @@
1
- require 'rcsv'
1
+ require 'csv'
2
2
 
3
3
  module DataKit
4
4
  module CSV
@@ -9,29 +9,25 @@ module DataKit
9
9
 
10
10
  def initialize(path)
11
11
  @path = path
12
-
13
- set_handle
14
- set_headers
12
+ set_handle
15
13
  end
16
14
 
17
15
  def each_row(&block)
16
+ first = true
18
17
  handle.rewind
19
- Rcsv.parse(handle, :header => :skip, :columns => columns, :row_as_hash => true) do |row|
20
- yield row
18
+
19
+ ::CSV.parse(handle, converters: nil) do |row|
20
+ if first == true
21
+ first = false
22
+ @headers = row
23
+ else
24
+ yield row
25
+ end
21
26
  end
22
27
  end
23
28
 
24
29
  private
25
30
 
26
- def columns
27
- index = -1
28
- @columns ||= headers.inject({}) do |result, field_name|
29
- index += 1
30
- result[index] = { :alias => field_name }
31
- result
32
- end
33
- end
34
-
35
31
  def set_handle
36
32
  if path.is_a?(IO)
37
33
  @handle = path
@@ -44,11 +40,6 @@ module DataKit
44
40
  {:invalid => :replace, :undef => :replace, :replace => ''}
45
41
  )
46
42
  end
47
-
48
- def set_headers
49
- handle.rewind
50
- Rcsv.parse(handle, :header => :none) { |row| @headers = row; break }
51
- end
52
43
  end
53
44
  end
54
45
  end
@@ -19,15 +19,21 @@ module DataKit
19
19
  end
20
20
 
21
21
  def execute
22
+ first = true
23
+ analysis = nil
22
24
  random = Random.new
23
- analysis = SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)
24
25
 
25
26
  csv.each_row do |row|
27
+ if first
28
+ first = false
29
+ analysis = SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)
30
+ end
31
+
26
32
  analysis.increment_total
27
33
  if random.rand <= sampling_rate
28
34
  analysis.increment_sample
29
- row.keys.each do |field_name|
30
- analysis.insert(field_name.to_s, row[field_name])
35
+ row.each_with_index do |value, index|
36
+ analysis.insert(csv.headers[index].to_s, value)
31
37
  end
32
38
  end
33
39
  end
@@ -1,3 +1,3 @@
1
1
  module DataKit
2
- VERSION = "0.0.11"
2
+ VERSION = "0.0.12"
3
3
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.11
4
+ version: 0.0.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mode Analytics
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-18 00:00:00.000000000 Z
11
+ date: 2014-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: rcsv
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - '>='
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - '>='
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: timeliness
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -137,7 +123,6 @@ files:
137
123
  - lib/data_kit/csv/schema_analyzer.rb
138
124
  - lib/data_kit/dataset/field.rb
139
125
  - lib/data_kit/dataset/schema.rb
140
- - lib/data_kit/patches/rcsv.rb
141
126
  - lib/data_kit/version.rb
142
127
  - spec/converters/boolean_spec.rb
143
128
  - spec/converters/date_time_spec.rb
@@ -1,124 +0,0 @@
1
- require "rcsv/rcsv"
2
- require "rcsv/version"
3
-
4
- require "stringio"
5
-
6
- #
7
- # This is a temporary monkey patch to Rcsv.parse
8
- # to silence warnings in Ruby 2 about #lines being deprecated
9
- #
10
-
11
- class Rcsv
12
- def self.parse(csv_data, options = {}, &block)
13
- options[:header] ||= :use
14
- raw_options = {}
15
-
16
- raw_options[:col_sep] = options[:column_separator] && options[:column_separator][0] || ','
17
- raw_options[:offset_rows] = options[:offset_rows] || 0
18
- raw_options[:nostrict] = options[:nostrict]
19
- raw_options[:parse_empty_fields_as] = options[:parse_empty_fields_as]
20
- raw_options[:buffer_size] = options[:buffer_size] || 1024 * 1024 # 1 MiB
21
-
22
- if csv_data.is_a?(String)
23
- csv_data = StringIO.new(csv_data)
24
- elsif !(csv_data.respond_to?(:lines) && csv_data.respond_to?(:read))
25
- inspected_csv_data = csv_data.inspect
26
- raise ParseError.new("Supplied CSV object #{inspected_csv_data[0..127]}#{inspected_csv_data.size > 128 ? '...' : ''} is neither String nor looks like IO object.")
27
- end
28
-
29
- if csv_data.respond_to?(:external_encoding)
30
- raw_options[:output_encoding] = csv_data.external_encoding.to_s
31
- end
32
-
33
- initial_position = csv_data.pos
34
-
35
- first_line = csv_data.each_line.first
36
- field_count = first_line.split(raw_options[:col_sep]).length
37
-
38
- case options[:header]
39
- when :use
40
- header = self.raw_parse(StringIO.new(first_line), raw_options).first
41
- raw_options[:offset_rows] += 1
42
- when :skip
43
- header = (0..field_count).to_a
44
- raw_options[:offset_rows] += 1
45
- when :none
46
- header = (0..field_count).to_a
47
- end
48
-
49
- raw_options[:row_as_hash] = options[:row_as_hash] # Setting after header parsing
50
-
51
- if options[:columns]
52
- only_rows = []
53
- except_rows = []
54
- row_defaults = []
55
- column_names = []
56
- row_conversions = ''
57
-
58
- header.each do |column_header|
59
- column_options = options[:columns][column_header]
60
- if column_options
61
- if (options[:row_as_hash])
62
- column_names << (column_options[:alias] || column_header)
63
- end
64
-
65
- row_defaults << column_options[:default] || nil
66
-
67
- only_rows << case column_options[:match]
68
- when Array
69
- column_options[:match]
70
- when nil
71
- nil
72
- else
73
- [column_options[:match]]
74
- end
75
-
76
- except_rows << case column_options[:not_match]
77
- when Array
78
- column_options[:not_match]
79
- when nil
80
- nil
81
- else
82
- [column_options[:not_match]]
83
- end
84
-
85
- row_conversions << case column_options[:type]
86
- when :int
87
- 'i'
88
- when :float
89
- 'f'
90
- when :string
91
- 's'
92
- when :bool
93
- 'b'
94
- when nil
95
- 's' # strings by default
96
- else
97
- fail "Unknown column type #{column_options[:type].inspect}."
98
- end
99
- elsif options[:only_listed_columns]
100
- column_names << nil
101
- row_defaults << nil
102
- only_rows << nil
103
- except_rows << nil
104
- row_conversions << ' '
105
- else
106
- column_names << column_header
107
- row_defaults << nil
108
- only_rows << nil
109
- except_rows << nil
110
- row_conversions << 's'
111
- end
112
- end
113
-
114
- raw_options[:column_names] = column_names if options[:row_as_hash]
115
- raw_options[:only_rows] = only_rows unless only_rows.compact.empty?
116
- raw_options[:except_rows] = except_rows unless except_rows.compact.empty?
117
- raw_options[:row_defaults] = row_defaults unless row_defaults.compact.empty?
118
- raw_options[:row_conversions] = row_conversions
119
- end
120
-
121
- csv_data.pos = initial_position
122
- return self.raw_parse(csv_data, raw_options, &block)
123
- end
124
- end