data_kit 0.0.11 → 0.0.12

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 9a0057bb3688ec21f40b1c696675839e8408b98f
4
- data.tar.gz: 24429670b09afe543ff132f45bc44f14a5840d5e
3
+ metadata.gz: 30b22c22f9efa81ca1011684ed418acfd1fce0e1
4
+ data.tar.gz: 0875d12a8b029661945098eebe35965420a5dbb8
5
5
  SHA512:
6
- metadata.gz: 55c82ae032798b11107b6fed0bc69b1c83b2503c56dde2274c5b02f4f57f82391f92184548048f834bc3beb0658467cfedb1af436accdbd4ae1c66a9b2f79480
7
- data.tar.gz: 4b89e5f6031c28a4a3403e291e92ca278d470368ae437c740155e84f5e4acc11865204b5eb91d228b99855983c82b9eb663e29267853e092e3341d1897e55746
6
+ metadata.gz: 6520bb22fbce4620c012a1757b84582d68e577ccb2d8f730017f4afa9ac8190f9813a056582e220c012a5c0f0ec206fd4abc40af36810b23125758a2cfaa3180
7
+ data.tar.gz: 9518d32e467cce05341449ce4409ccad339e680f38bf8b65ad357d8f96a2311110d7eb3d5f2c826af56bbae5fed0baea16674837bbab8656be548926fab7a040
data/data_kit.gemspec CHANGED
@@ -19,7 +19,6 @@ Gem::Specification.new do |spec|
19
19
  spec.require_paths = ["lib"]
20
20
 
21
21
  # Runtime Dependencies
22
- spec.add_runtime_dependency "rcsv"
23
22
  spec.add_runtime_dependency "timeliness"
24
23
 
25
24
  # Development Dependencies
data/lib/data_kit.rb CHANGED
@@ -20,7 +20,4 @@ require 'data_kit/converters/date_time'
20
20
 
21
21
  # Datasets
22
22
  require 'data_kit/dataset/field'
23
- require 'data_kit/dataset/schema'
24
-
25
- # Patches / Fixes
26
- require 'data_kit/patches/rcsv'
23
+ require 'data_kit/dataset/schema'
@@ -16,11 +16,15 @@ module DataKit
16
16
 
17
17
  def execute
18
18
  ::CSV.open(output_path, 'wb') do |writer|
19
- writer << csv.headers
19
+ first = true
20
+ converted = []
20
21
  csv.each_row do |row|
21
- writer << csv.headers.collect do |field_name|
22
- convert(row[field_name], field_types[field_name])
22
+ if first
23
+ first = false
24
+ writer << csv.headers
23
25
  end
26
+
27
+ writer << convert_row(csv.headers, row)
24
28
  end
25
29
  end
26
30
  end
@@ -39,7 +43,15 @@ module DataKit
39
43
 
40
44
  private
41
45
 
42
- def convert(value, type)
46
+ def convert_row(headers, row)
47
+ converted = []
48
+ headers.each_with_index do |field_name, index|
49
+ converted << convert_value(row[index], field_types[field_name])
50
+ end
51
+ converted
52
+ end
53
+
54
+ def convert_value(value, type)
43
55
  if value.nil? || type == :string || type == :empty
44
56
  return value.to_s
45
57
  else
@@ -14,16 +14,21 @@ module DataKit
14
14
  end
15
15
 
16
16
  def execute
17
+ first = true
18
+ analysis = nil
17
19
  random = Random.new
18
20
 
19
- field_name = csv.headers[field_pos]
20
- analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
21
-
22
21
  csv.each_row do |row|
22
+ if first
23
+ first = false
24
+ field_name = csv.headers[field_pos]
25
+ analysis = FieldAnalysis.new(field_name, { :match_type => match_type })
26
+ end
27
+
23
28
  analysis.increment_total
24
29
  if random.rand <= sampling_rate
25
30
  analysis.increment_sample
26
- analysis.insert(row[field_name])
31
+ analysis.insert(row[field_pos])
27
32
  end
28
33
  end
29
34
 
@@ -1,4 +1,4 @@
1
- require 'rcsv'
1
+ require 'csv'
2
2
 
3
3
  module DataKit
4
4
  module CSV
@@ -9,29 +9,25 @@ module DataKit
9
9
 
10
10
  def initialize(path)
11
11
  @path = path
12
-
13
- set_handle
14
- set_headers
12
+ set_handle
15
13
  end
16
14
 
17
15
  def each_row(&block)
16
+ first = true
18
17
  handle.rewind
19
- Rcsv.parse(handle, :header => :skip, :columns => columns, :row_as_hash => true) do |row|
20
- yield row
18
+
19
+ ::CSV.parse(handle, converters: nil) do |row|
20
+ if first == true
21
+ first = false
22
+ @headers = row
23
+ else
24
+ yield row
25
+ end
21
26
  end
22
27
  end
23
28
 
24
29
  private
25
30
 
26
- def columns
27
- index = -1
28
- @columns ||= headers.inject({}) do |result, field_name|
29
- index += 1
30
- result[index] = { :alias => field_name }
31
- result
32
- end
33
- end
34
-
35
31
  def set_handle
36
32
  if path.is_a?(IO)
37
33
  @handle = path
@@ -44,11 +40,6 @@ module DataKit
44
40
  {:invalid => :replace, :undef => :replace, :replace => ''}
45
41
  )
46
42
  end
47
-
48
- def set_headers
49
- handle.rewind
50
- Rcsv.parse(handle, :header => :none) { |row| @headers = row; break }
51
- end
52
43
  end
53
44
  end
54
45
  end
@@ -19,15 +19,21 @@ module DataKit
19
19
  end
20
20
 
21
21
  def execute
22
+ first = true
23
+ analysis = nil
22
24
  random = Random.new
23
- analysis = SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)
24
25
 
25
26
  csv.each_row do |row|
27
+ if first
28
+ first = false
29
+ analysis = SchemaAnalysis.new(csv.headers, :use_type_hints => use_type_hints)
30
+ end
31
+
26
32
  analysis.increment_total
27
33
  if random.rand <= sampling_rate
28
34
  analysis.increment_sample
29
- row.keys.each do |field_name|
30
- analysis.insert(field_name.to_s, row[field_name])
35
+ row.each_with_index do |value, index|
36
+ analysis.insert(csv.headers[index].to_s, value)
31
37
  end
32
38
  end
33
39
  end
@@ -1,3 +1,3 @@
1
1
  module DataKit
2
- VERSION = "0.0.11"
2
+ VERSION = "0.0.12"
3
3
  end
metadata CHANGED
@@ -1,29 +1,15 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: data_kit
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.11
4
+ version: 0.0.12
5
5
  platform: ruby
6
6
  authors:
7
7
  - Mode Analytics
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2013-12-18 00:00:00.000000000 Z
11
+ date: 2014-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
- - !ruby/object:Gem::Dependency
14
- name: rcsv
15
- requirement: !ruby/object:Gem::Requirement
16
- requirements:
17
- - - '>='
18
- - !ruby/object:Gem::Version
19
- version: '0'
20
- type: :runtime
21
- prerelease: false
22
- version_requirements: !ruby/object:Gem::Requirement
23
- requirements:
24
- - - '>='
25
- - !ruby/object:Gem::Version
26
- version: '0'
27
13
  - !ruby/object:Gem::Dependency
28
14
  name: timeliness
29
15
  requirement: !ruby/object:Gem::Requirement
@@ -137,7 +123,6 @@ files:
137
123
  - lib/data_kit/csv/schema_analyzer.rb
138
124
  - lib/data_kit/dataset/field.rb
139
125
  - lib/data_kit/dataset/schema.rb
140
- - lib/data_kit/patches/rcsv.rb
141
126
  - lib/data_kit/version.rb
142
127
  - spec/converters/boolean_spec.rb
143
128
  - spec/converters/date_time_spec.rb
@@ -1,124 +0,0 @@
1
- require "rcsv/rcsv"
2
- require "rcsv/version"
3
-
4
- require "stringio"
5
-
6
- #
7
- # This is a temporary monkey patch to Rcsv.parse
8
- # to silence warnings in Ruby 2 about #lines being deprecated
9
- #
10
-
11
- class Rcsv
12
- def self.parse(csv_data, options = {}, &block)
13
- options[:header] ||= :use
14
- raw_options = {}
15
-
16
- raw_options[:col_sep] = options[:column_separator] && options[:column_separator][0] || ','
17
- raw_options[:offset_rows] = options[:offset_rows] || 0
18
- raw_options[:nostrict] = options[:nostrict]
19
- raw_options[:parse_empty_fields_as] = options[:parse_empty_fields_as]
20
- raw_options[:buffer_size] = options[:buffer_size] || 1024 * 1024 # 1 MiB
21
-
22
- if csv_data.is_a?(String)
23
- csv_data = StringIO.new(csv_data)
24
- elsif !(csv_data.respond_to?(:lines) && csv_data.respond_to?(:read))
25
- inspected_csv_data = csv_data.inspect
26
- raise ParseError.new("Supplied CSV object #{inspected_csv_data[0..127]}#{inspected_csv_data.size > 128 ? '...' : ''} is neither String nor looks like IO object.")
27
- end
28
-
29
- if csv_data.respond_to?(:external_encoding)
30
- raw_options[:output_encoding] = csv_data.external_encoding.to_s
31
- end
32
-
33
- initial_position = csv_data.pos
34
-
35
- first_line = csv_data.each_line.first
36
- field_count = first_line.split(raw_options[:col_sep]).length
37
-
38
- case options[:header]
39
- when :use
40
- header = self.raw_parse(StringIO.new(first_line), raw_options).first
41
- raw_options[:offset_rows] += 1
42
- when :skip
43
- header = (0..field_count).to_a
44
- raw_options[:offset_rows] += 1
45
- when :none
46
- header = (0..field_count).to_a
47
- end
48
-
49
- raw_options[:row_as_hash] = options[:row_as_hash] # Setting after header parsing
50
-
51
- if options[:columns]
52
- only_rows = []
53
- except_rows = []
54
- row_defaults = []
55
- column_names = []
56
- row_conversions = ''
57
-
58
- header.each do |column_header|
59
- column_options = options[:columns][column_header]
60
- if column_options
61
- if (options[:row_as_hash])
62
- column_names << (column_options[:alias] || column_header)
63
- end
64
-
65
- row_defaults << column_options[:default] || nil
66
-
67
- only_rows << case column_options[:match]
68
- when Array
69
- column_options[:match]
70
- when nil
71
- nil
72
- else
73
- [column_options[:match]]
74
- end
75
-
76
- except_rows << case column_options[:not_match]
77
- when Array
78
- column_options[:not_match]
79
- when nil
80
- nil
81
- else
82
- [column_options[:not_match]]
83
- end
84
-
85
- row_conversions << case column_options[:type]
86
- when :int
87
- 'i'
88
- when :float
89
- 'f'
90
- when :string
91
- 's'
92
- when :bool
93
- 'b'
94
- when nil
95
- 's' # strings by default
96
- else
97
- fail "Unknown column type #{column_options[:type].inspect}."
98
- end
99
- elsif options[:only_listed_columns]
100
- column_names << nil
101
- row_defaults << nil
102
- only_rows << nil
103
- except_rows << nil
104
- row_conversions << ' '
105
- else
106
- column_names << column_header
107
- row_defaults << nil
108
- only_rows << nil
109
- except_rows << nil
110
- row_conversions << 's'
111
- end
112
- end
113
-
114
- raw_options[:column_names] = column_names if options[:row_as_hash]
115
- raw_options[:only_rows] = only_rows unless only_rows.compact.empty?
116
- raw_options[:except_rows] = except_rows unless except_rows.compact.empty?
117
- raw_options[:row_defaults] = row_defaults unless row_defaults.compact.empty?
118
- raw_options[:row_conversions] = row_conversions
119
- end
120
-
121
- csv_data.pos = initial_position
122
- return self.raw_parse(csv_data, raw_options, &block)
123
- end
124
- end