rover-df 0.2.6 → 0.2.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 01e2a90ba133ae07ad6ad482bdca985df806d6a073fa2d93029b2b7e1b55dc49
4
- data.tar.gz: 96f4171420dea68b38cffdd5a365657bc464f6d1f0c4f6bf1aefb20377c56179
3
+ metadata.gz: c75bed3989211e806e54c296290e5f7b3af236a15742daac876e211e3ca5a76f
4
+ data.tar.gz: 5865ff8f1d0036423f18cfee867da63214ee50f79d373b0f0f244853d8efbefa
5
5
  SHA512:
6
- metadata.gz: 2451d6844c7ece459e61c8e1499047f8efd6472a0d57317b7e2e1110527d843e8177c16ccbb1aeb0fd61e647fdd4291ebf73d4bfe008560eff7b963b1ac22ee6
7
- data.tar.gz: 18ad0cfb8fc22aeb63d2e1b11333b1a5989c7bcc0f2b5fbebedb11acf3d3dc26e7235109e1501d0e4b9a06b5aa7e47b71bdb23de6d6fcd87c9fb53d2bf0be330
6
+ metadata.gz: 11718bc8ade75a605e92cabe05c29e55c6d4dfe427cd5ada0a8a216db678b32a88f4a43843d1e7dcda7b7a64adb63b76969f1d958e91ca57c4f71989632e14aa
7
+ data.tar.gz: 16940236090625bef69cb14d6d9f9f50720314edea1b5892f60443799e5389700ddfb0d79a29ee1e193168097add9d7195799e7f049d85f9c9dc9c443843a678
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.2.7 (2022-01-16)
2
+
3
+ - Added support for booleans to Parquet methods
4
+ - Added support for creating data frames from `ActiveRecord::Result`
5
+ - Added `types` option to `read_parquet` and `parse_parquet` methods
6
+
1
7
  ## 0.2.6 (2021-10-27)
2
8
 
3
9
  - Added support for `nil` headers to `read_csv` and `parse_csv`
data/LICENSE.txt CHANGED
@@ -1,4 +1,4 @@
1
- Copyright (c) 2020-2021 Andrew Kane
1
+ Copyright (c) 2020-2022 Andrew Kane
2
2
 
3
3
  MIT License
4
4
 
data/README.md CHANGED
@@ -61,7 +61,7 @@ Rover.read_csv("file.csv")
61
61
  Rover.parse_csv("CSV,data,string")
62
62
  ```
63
63
 
64
- From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) [unreleased]
64
+ From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
65
65
 
66
66
  ```ruby
67
67
  Rover.read_parquet("file.parquet")
@@ -401,7 +401,7 @@ CSV
401
401
  df.to_csv
402
402
  ```
403
403
 
404
- Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) [unreleased]
404
+ Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem)
405
405
 
406
406
  ```ruby
407
407
  df.to_parquet
data/lib/rover/data_frame.rb CHANGED
@@ -40,8 +40,8 @@ module Rover
40
40
  vectors.each do |k, v|
41
41
  @vectors[k] = to_vector(v, type: types[k])
42
42
  end
43
- elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base))
44
- result = data.connection.select_all(data.all.to_sql)
43
+ elsif defined?(ActiveRecord) && (data.is_a?(ActiveRecord::Relation) || (data.is_a?(Class) && data < ActiveRecord::Base) || data.is_a?(ActiveRecord::Result))
44
+ result = data.is_a?(ActiveRecord::Result) ? data : data.connection.select_all(data.all.to_sql)
45
45
  result.columns.each_with_index do |k, i|
46
46
  @vectors[k] = to_vector(result.rows.map { |r| r[i] }, type: types[k])
47
47
  end
@@ -250,6 +250,8 @@ module Rover
250
250
  :double
251
251
  when :float32
252
252
  :float
253
+ when :bool
254
+ :boolean
253
255
  when :object
254
256
  if @vectors[name].all? { |v| v.is_a?(String) }
255
257
  :string
data/lib/rover/vector.rb CHANGED
@@ -359,6 +359,7 @@ module Rover
359
359
  data = data.to_a
360
360
 
361
361
  if type
362
+ data = data.map { |v| v || Float::NAN } if [:float, :float32].include?(type)
362
363
  data = numo_type.cast(data)
363
364
  else
364
365
  data =
data/lib/rover/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rover
2
- VERSION = "0.2.6"
2
+ VERSION = "0.2.7"
3
3
  end
data/lib/rover.rb CHANGED
@@ -19,14 +19,14 @@ module Rover
19
19
  csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
20
20
  end
21
21
 
22
- def read_parquet(path)
22
+ def read_parquet(path, types: nil)
23
23
  require "parquet"
24
- parquet_to_df(Arrow::Table.load(path))
24
+ parquet_to_df(Arrow::Table.load(path), types: types)
25
25
  end
26
26
 
27
- def parse_parquet(str)
27
+ def parse_parquet(str, types: nil)
28
28
  require "parquet"
29
- parquet_to_df(Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet))
29
+ parquet_to_df(Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet), types: types)
30
30
  end
31
31
 
32
32
  private
@@ -64,6 +64,7 @@ module Rover
64
64
  end
65
65
 
66
66
  PARQUET_TYPE_MAPPING = {
67
+ "bool" => Numo::Bit,
67
68
  "float" => Numo::SFloat,
68
69
  "double" => Numo::DFloat,
69
70
  "int8" => Numo::Int8,
@@ -77,15 +78,28 @@ module Rover
77
78
  "uint64" => Numo::UInt64
78
79
  }
79
80
 
80
- def parquet_to_df(table)
81
+ def parquet_to_df(table, types: nil)
81
82
  data = {}
83
+ types ||= {}
82
84
  table.each_column do |column|
83
85
  k = column.field.name
84
- type = column.field.data_type.to_s
85
- numo_type = PARQUET_TYPE_MAPPING[type]
86
- raise "Unknown type: #{type}" unless numo_type
87
- # TODO improve performance
88
- data[k] = numo_type.cast(column.data.values)
86
+ if types[k]
87
+ data[k] = Vector.new(column.data.values, type: types[k])
88
+ else
89
+ type = column.field.data_type.to_s
90
+ numo_type = PARQUET_TYPE_MAPPING[type]
91
+ raise "Unknown type: #{type}" unless numo_type
92
+
93
+ # TODO automatic conversion?
94
+ # int => float
95
+ # bool => object
96
+ if (type.include?("int") || type == "bool") && column.n_nulls > 0
97
+ raise "Nulls not supported for #{type} column: #{k}"
98
+ end
99
+
100
+ # TODO improve performance
101
+ data[k] = numo_type.cast(column.data.values)
102
+ end
89
103
  end
90
104
  DataFrame.new(data)
91
105
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rover-df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.6
4
+ version: 0.2.7
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-10-27 00:00:00.000000000 Z
11
+ date: 2022-01-16 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray
@@ -58,7 +58,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
58
58
  - !ruby/object:Gem::Version
59
59
  version: '0'
60
60
  requirements: []
61
- rubygems_version: 3.2.22
61
+ rubygems_version: 3.3.3
62
62
  signing_key:
63
63
  specification_version: 4
64
64
  summary: Simple, powerful data frames for Ruby