rover-df 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: ca39a558c3c12103f03fed4cb8f007fbd00a1f8e84b839916fd0010aae4613ba
4
- data.tar.gz: 43df8cdc415cc036ac383f30b7c91a35b644067a3cb8ea199abd7452b98298d5
3
+ metadata.gz: 01e2a90ba133ae07ad6ad482bdca985df806d6a073fa2d93029b2b7e1b55dc49
4
+ data.tar.gz: 96f4171420dea68b38cffdd5a365657bc464f6d1f0c4f6bf1aefb20377c56179
5
5
  SHA512:
6
- metadata.gz: 2724c7e85ee7921f277be833cf89be638c14cbb37a44411bba86c42cacffe7c0e4b82ea04d4dfb3d694c6429ba41bc8e8c10f7cb40e5d34bf59d14755858735f
7
- data.tar.gz: fa860158decbca0a0b35ccb82e6f73d9a513c37b483eca52d140842d5dd255899a2e1ded3ec4375a492b86d3ec09ffa53d4871e05f1fdad39f3d2630215417dc
6
+ metadata.gz: 2451d6844c7ece459e61c8e1499047f8efd6472a0d57317b7e2e1110527d843e8177c16ccbb1aeb0fd61e647fdd4291ebf73d4bfe008560eff7b963b1ac22ee6
7
+ data.tar.gz: 18ad0cfb8fc22aeb63d2e1b11333b1a5989c7bcc0f2b5fbebedb11acf3d3dc26e7235109e1501d0e4b9a06b5aa7e47b71bdb23de6d6fcd87c9fb53d2bf0be330
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.2.6 (2021-10-27)
2
+
3
+ - Added support for `nil` headers to `read_csv` and `parse_csv`
4
+ - Added `read_parquet`, `parse_parquet`, and `to_parquet` methods
5
+
1
6
  ## 0.2.5 (2021-09-25)
2
7
 
3
8
  - Fixed column types with joins
data/README.md CHANGED
@@ -61,6 +61,14 @@ Rover.read_csv("file.csv")
61
61
  Rover.parse_csv("CSV,data,string")
62
62
  ```
63
63
 
64
+ From Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) [unreleased]
65
+
66
+ ```ruby
67
+ Rover.read_parquet("file.parquet")
68
+ # or
69
+ Rover.parse_parquet("PAR1...")
70
+ ```
71
+
64
72
  ## Attributes
65
73
 
66
74
  Get number of rows
@@ -89,7 +97,7 @@ Select a column
89
97
  df[:a]
90
98
  ```
91
99
 
92
- > Note that strings and symbols are different keys, just like hashes
100
+ > Note that strings and symbols are different keys, just like hashes. Creating a data frame from Active Record, a CSV, or Parquet uses strings.
93
101
 
94
102
  Select multiple columns
95
103
 
@@ -393,6 +401,12 @@ CSV
393
401
  df.to_csv
394
402
  ```
395
403
 
404
+ Parquet (requires the [red-parquet](https://github.com/apache/arrow/tree/master/ruby/red-parquet) gem) [unreleased]
405
+
406
+ ```ruby
407
+ df.to_parquet
408
+ ```
409
+
396
410
  ## Types
397
411
 
398
412
  You can specify column types when creating a data frame
data/lib/rover/data_frame.rb CHANGED
@@ -235,6 +235,42 @@ module Rover
235
235
  end
236
236
  end
237
237
 
238
+ def to_parquet
239
+ require "parquet"
240
+
241
+ schema = {}
242
+ types.each do |name, type|
243
+ schema[name] =
244
+ case type
245
+ when :int
246
+ :int64
247
+ when :uint
248
+ :uint64
249
+ when :float
250
+ :double
251
+ when :float32
252
+ :float
253
+ when :object
254
+ if @vectors[name].all? { |v| v.is_a?(String) }
255
+ :string
256
+ else
257
+ raise "Unknown type"
258
+ end
259
+ else
260
+ type
261
+ end
262
+ end
263
+ # TODO improve performance
264
+ raw_records = []
265
+ size.times do |i|
266
+ raw_records << @vectors.map { |_, v| v[i] }
267
+ end
268
+ table = Arrow::Table.new(schema, raw_records)
269
+ buffer = Arrow::ResizableBuffer.new(1024)
270
+ table.save(buffer, format: :parquet)
271
+ buffer.data.to_s
272
+ end
273
+
238
274
  # for IRuby
239
275
  def to_html
240
276
  require "iruby"
data/lib/rover/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Rover
2
- VERSION = "0.2.5"
2
+ VERSION = "0.2.6"
3
3
  end
data/lib/rover.rb CHANGED
@@ -19,6 +19,16 @@ module Rover
19
19
  csv_to_df(CSV.parse(str, **csv_options(options)), types: types, headers: options[:headers])
20
20
  end
21
21
 
22
+ def read_parquet(path)
23
+ require "parquet"
24
+ parquet_to_df(Arrow::Table.load(path))
25
+ end
26
+
27
+ def parse_parquet(str)
28
+ require "parquet"
29
+ parquet_to_df(Arrow::Table.load(Arrow::Buffer.new(str), format: :parquet))
30
+ end
31
+
22
32
  private
23
33
 
24
34
  # TODO use date converter
@@ -35,10 +45,49 @@ module Rover
35
45
 
36
46
  table.by_col!
37
47
  data = {}
48
+ keys = table.map { |k, _| [k, true] }.to_h
49
+ unnamed_suffix = 1
38
50
  table.each do |k, v|
51
+ # TODO do same for empty string in 0.3.0
52
+ if k.nil?
53
+ k = "unnamed"
54
+ while keys.include?(k)
55
+ unnamed_suffix += 1
56
+ k = "unnamed#{unnamed_suffix}"
57
+ end
58
+ keys[k] = true
59
+ end
39
60
  data[k] = v
40
61
  end
62
+
41
63
  DataFrame.new(data, types: types)
42
64
  end
65
+
66
+ PARQUET_TYPE_MAPPING = {
67
+ "float" => Numo::SFloat,
68
+ "double" => Numo::DFloat,
69
+ "int8" => Numo::Int8,
70
+ "int16" => Numo::Int16,
71
+ "int32" => Numo::Int32,
72
+ "int64" => Numo::Int64,
73
+ "string" => Numo::RObject,
74
+ "uint8" => Numo::UInt8,
75
+ "uint16" => Numo::UInt16,
76
+ "uint32" => Numo::UInt32,
77
+ "uint64" => Numo::UInt64
78
+ }
79
+
80
+ def parquet_to_df(table)
81
+ data = {}
82
+ table.each_column do |column|
83
+ k = column.field.name
84
+ type = column.field.data_type.to_s
85
+ numo_type = PARQUET_TYPE_MAPPING[type]
86
+ raise "Unknown type: #{type}" unless numo_type
87
+ # TODO improve performance
88
+ data[k] = numo_type.cast(column.data.values)
89
+ end
90
+ DataFrame.new(data)
91
+ end
43
92
  end
44
93
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: rover-df
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.5
4
+ version: 0.2.6
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-09-25 00:00:00.000000000 Z
11
+ date: 2021-10-27 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: numo-narray