orcfile 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/README.rdoc +99 -0
- data/lib/jars/commons-configuration-1.10.jar +0 -0
- data/lib/jars/commons-logging-1.2.jar +0 -0
- data/lib/jars/hadoop-core-1.2.1.jar +0 -0
- data/lib/jars/hive-exec-2.1.1.jar +0 -0
- data/lib/jars/slf4j-api-1.7.9.jar +0 -0
- data/lib/jars/slf4j-simple-1.7.20.jar +0 -0
- data/lib/orc_file.rb +25 -0
- data/lib/orc_file_reader.rb +49 -0
- data/lib/orc_file_writer.rb +66 -0
- data/lib/orc_options.rb +48 -0
- data/lib/orc_reader_options.rb +9 -0
- data/lib/orc_schema.rb +28 -0
- metadata +87 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ee87da3cc76f9655fd8393da478578d57143210e
|
4
|
+
data.tar.gz: dcac0b20f2f932694b3eab4a1bc32bcf0e7037d1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1a50f7b1a22662b433b213bf0e56d0b012e1c6c07c0773caadf6f6095307fee78ebfd2ac6d94f6d4e699cf1e75ef6c992075e68a51732a3fa010a33d770e5bc5
|
7
|
+
data.tar.gz: b9c8e252962f7215828636923ebead9ca8f2d122cadb5d559cac77d2e453ce7dafcf089f9d9ff05ef0cea9036f2663e97abf766295a9ea8c1c95269a38e4d3c9
|
data/README.rdoc
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
=ORCFILE
|
2
|
+
Ruby Gem for reading and writing Apache Optimized Row Columnar (ORC) files.
|
3
|
+
This gem can also be paired with the factory_girl gem.
|
4
|
+
|
5
|
+
==Installation
|
6
|
+
Must use jruby.
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'orc_file'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle install
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install orc_file
|
19
|
+
|
20
|
+
==Usage
|
21
|
+
===+OrcFileWriter+
|
22
|
+
To write a file, you will need to initialize the OrcFileWriter class.
|
23
|
+
This object needs a table schema, your dataset, the path to store the file, and an optional configuration hash.
|
24
|
+
|
25
|
+
OrcFileWriter.new(table_schema, data_set, path, *options={})
|
26
|
+
====_table_schema_
|
27
|
+
The table_schema must be a hash containing the column name and datatype as the key-value pair.
|
28
|
+
|
29
|
+
Valid datatypes are:
|
30
|
+
- integer
|
31
|
+
- decimal
|
32
|
+
- float
|
33
|
+
- date
|
34
|
+
- datetime
|
35
|
+
- time
|
36
|
+
- string
|
37
|
+
|
38
|
+
|
39
|
+
table_schema = {:id => :integer, :amount => :decimal, :rate => :float}
|
40
|
+
|
41
|
+
====_data_set_
|
42
|
+
The data_set must contain a hash with the column name and data value as the key-value pair.
|
43
|
+
|
44
|
+
For one row in the dataset:
|
45
|
+
|
46
|
+
data_set = {:id => 1, :amount => 1000.01, :rate => 0.0005}
|
47
|
+
|
48
|
+
For multiple rows in the dataset:
|
49
|
+
|
50
|
+
dataset = [{:id => 1, :amount => 1000.01, :rate => 0.0005},
|
51
|
+
{:id => 2, :amount => 2500.5, :rate => 0.1},
|
52
|
+
{:id => 3, :amount => 10.12, :rate => 10.0134}]
|
53
|
+
|
54
|
+
====_path_
|
55
|
+
The path should be the full file path or relative to your working directory. You must also specify the file name.
|
56
|
+
|
57
|
+
path = '/temp/orc_file.orc'
|
58
|
+
|
59
|
+
====_options_
|
60
|
+
Options is an optional hash parameter containing 4 configurable settings for writing an ORC file.
|
61
|
+
|
62
|
+
`:stripe_size` defines the size of the stripe, defaulted as 67,108,864 bytes <br>
|
63
|
+
`:row_index_stride` defines the number of rows between row index entries, defaulted as 10,000 <br>
|
64
|
+
`:buffer_size` defines the orc buffer size, defaulted as 262,144 bytes <br>
|
65
|
+
`:compression` defines the compression codec (NONE,ZLIB,SNAPPY,LZO), defaulted as ZLIB. <br>
|
66
|
+
|
67
|
+
Define the options parameter as a hash
|
68
|
+
|
69
|
+
options = {:stripe_size => 70000000, :compression => 'SNAPPY'}
|
70
|
+
|
71
|
+
===+write_to_orc+
|
72
|
+
Once you have the OrcFileWriter object initialized you must call write_to_orc to write out the file
|
73
|
+
|
74
|
+
OrcFileWriter.new(table_schema, data_set, path, options).write_to_orc
|
75
|
+
|
76
|
+
===+OrcFileReader+
|
77
|
+
To read a file, you will need to initialize the OrcFileReader class.
|
78
|
+
This object needs a table schema, and the path of the file to be read.
|
79
|
+
|
80
|
+
OrcFileReader.new(table_schema, path)
|
81
|
+
====_table_schema_
|
82
|
+
The table_schema must be a hash containing the column name and datatype as the key-value pair.
|
83
|
+
|
84
|
+
Valid datatypes are:
|
85
|
+
- integer
|
86
|
+
- decimal
|
87
|
+
- float
|
88
|
+
- date
|
89
|
+
- datetime
|
90
|
+
- time
|
91
|
+
- string
|
92
|
+
|
93
|
+
|
94
|
+
table_schema = {:id => :integer, :amount => :decimal, :rate => :float}
|
95
|
+
|
96
|
+
====_path_
|
97
|
+
The path should be the full file path or relative to your working directory. You must also specify the file name.
|
98
|
+
|
99
|
+
path = '/temp/orc_file.orc'
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/orc_file.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'java'
|
2
|
+
require 'bigdecimal'
|
3
|
+
require 'time'
|
4
|
+
require 'jars/slf4j-api-1.7.9.jar'
|
5
|
+
require 'jars/commons-logging-1.2.jar'
|
6
|
+
require 'jars/commons-configuration-1.10.jar'
|
7
|
+
require 'jars/slf4j-simple-1.7.20.jar'
|
8
|
+
require 'jars/hadoop-core-1.2.1.jar'
|
9
|
+
require 'jars/hive-exec-2.1.1.jar'
|
10
|
+
require 'orc_schema'
|
11
|
+
require 'orc_options'
|
12
|
+
require 'orc_reader_options'
|
13
|
+
require 'orc_file_writer'
|
14
|
+
require 'orc_file_reader'
|
15
|
+
|
16
|
+
java_import 'org.slf4j.LoggerFactory'
|
17
|
+
java_import 'org.apache.hadoop.hive.common.type.HiveDecimal'
|
18
|
+
java_import 'org.apache.hadoop.conf.Configuration'
|
19
|
+
java_import 'org.apache.hadoop.fs.Path'
|
20
|
+
java_import 'org.apache.orc.CompressionKind'
|
21
|
+
java_import 'org.apache.orc.TypeDescription'
|
22
|
+
java_import 'org.apache.orc.OrcFile'
|
23
|
+
|
24
|
+
|
25
|
+
|
@@ -0,0 +1,49 @@
|
|
1
|
+
# Reads rows out of an Apache ORC file into an Array of Ruby Hashes.
#
# The table_schema supplied at construction must match the schema the file
# was written with: a Hash of {column_name => data_type} where data_type is
# one of :integer, :decimal, :float, :date, :datetime, :time, :string.
class OrcFileReader
  attr_reader :reader, :orc_options, :table_schema

  # table_schema - Hash of {column_name => data_type} (see class comment).
  # path         - String path to the ORC file to read.
  def initialize(table_schema, path='orc_file.orc')
    @orc_options = OrcReaderOptions.new
    @table_schema = table_schema
    path = Path.new(path)
    @reader = OrcFile.createReader(path, @orc_options.orc)
  end

  # Converts one row of a Java VectorizedRowBatch into a Ruby Hash keyed by
  # the schema's column names. row_index is the row's position within the
  # CURRENT batch, not its absolute position in the file.
  def read_row(row_batch, row_index)
    orc_row = {}
    row_batch.cols.each_with_index do |column, index|
      column_name = @table_schema.keys[index]
      data_type = @table_schema[column_name]
      case data_type
      when :integer
        orc_row[column_name] = column.vector[row_index]
      when :decimal
        # Round-trip through a string so we build an exact BigDecimal.
        orc_row[column_name] = column.vector[row_index].get_hive_decimal.to_s.to_d
      when :float
        # NOTE: single-precision storage means e.g. 0.0005 reads back as
        # 0.0005000000237487257.
        orc_row[column_name] = column.vector[row_index]
      when :datetime
        # column.time holds epoch milliseconds ('%Q' = milliseconds since epoch).
        orc_row[column_name] = DateTime.strptime(column.time[row_index].to_s, '%Q').to_time.to_datetime
      when :time
        orc_row[column_name] = Time.strptime(column.time[row_index].to_s, '%Q')
      when :date
        # ORC stores dates as a count of days since the Unix epoch (1970-01-01).
        orc_row[column_name] = Date.new(1970, 1, 1) + column.vector[row_index]
      when :string
        orc_row[column_name] = column.toString(row_index)
      end
    end
    orc_row
  end

  # Reads every row in the file and returns them as an Array of Hashes.
  #
  # Iterates batch-by-batch. The previous implementation fetched only the
  # FIRST batch but looped over the file's total row count, which read
  # stale/invalid data (or raised) for files larger than one batch
  # (1024 rows by default).
  def read_from_orc
    rows = Array.new
    row_batch = @reader.get_schema.createRowBatch()
    batch_reader = @reader.rows
    begin
      while batch_reader.next_batch(row_batch)
        row_batch.size.times do |row_index|
          rows << read_row(row_batch, row_index)
        end
      end
    ensure
      # Release the underlying file handle even if a row fails to convert.
      batch_reader.close
    end
    rows
  end

end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'orc_options'
|
2
|
+
|
3
|
+
# Writes a Ruby dataset out as an Apache ORC file.
class OrcFileWriter
  attr_reader :writer, :orc_options, :data_set, :table_schema, :output_path

  # table_schema - Hash of {column_name => data_type}; valid data types are
  #                :integer, :decimal, :float, :date, :datetime, :time, :string.
  # data_set     - Hash (single row) or Array of Hashes (multiple rows).
  # path         - output file path; '.orc' is appended if not already the extension.
  # options      - optional writer settings (see OrcOptions#set_options).
  def initialize(table_schema, data_set, path='orc_file.orc', options={})
    @orc_options = OrcOptions.new
    @orc_options.set_options(options)
    @table_schema = table_schema
    @data_set = data_set
    @orc_options.define_table_schema(table_schema)
    # Build a new string instead of mutating the caller's argument (the old
    # `path.concat` modified it in place), and test the extension rather
    # than a '.orc' substring anywhere in the path.
    path = "#{path}.orc" unless path.end_with?('.orc')
    # Previously declared via attr_reader but never assigned.
    @output_path = path
    path_object = Path.new(path)
    @writer = OrcFile.createWriter(path_object, @orc_options.orc)
  end

  # Builds a single-row VectorizedRowBatch from one {column => value} Hash.
  # Raises ArgumentError when a column's schema data type is unsupported.
  def create_row(row)
    orc_row = @orc_options.orc_schema.schema.createRowBatch()
    orc_row.size = 1
    row.each_with_index do |(key, value), index|
      data_type = @table_schema[key]
      if value.nil?
        orc_row.cols[index].noNulls = false
        case data_type
        when :datetime, :time, :decimal
          # Timestamp/decimal column vectors take the nil via set().
          orc_row.cols[index].set(0, value)
        else
          orc_row.cols[index].fill_with_nulls
        end
      else
        case data_type
        when :integer
          data_for_column = value.to_java(:long)
        when :decimal
          data_for_column = HiveDecimal.create(value.to_d.to_java)
        when :float
          data_for_column = value.to_java(:double)
        when :datetime, :time
          data_for_column = value.to_time
        when :date
          # hive needs the date formatted as number of days since the
          # epoch (1970-01-01)
          data_for_column = (value - Date.new(1970, 1, 1)).to_i
        when :string
          # Size the column buffer to the encoded byte length of the value.
          orc_row.cols[index].initBuffer(value.to_s.bytesize)
          data_for_column = value.to_s.bytes
        else
          raise ArgumentError, "column data type #{data_type} not defined"
        end
        orc_row.cols[index].fill(data_for_column)
      end
    end
    orc_row
  end

  # Writes the dataset to disk and closes the writer. Call exactly once;
  # the writer cannot be reused afterwards.
  def write_to_orc
    if @data_set.is_a? Array
      @data_set.each do |row|
        @writer.addRowBatch(create_row(row))
      end
    else
      @writer.addRowBatch(create_row(@data_set))
    end
  ensure
    # Close even when a row fails to convert so the file handle is released.
    @writer.close
  end

end
|
data/lib/orc_options.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'orc_schema'
|
2
|
+
|
3
|
+
# Wraps the ORC writer configuration: holds the column schema being built
# and the underlying OrcFile writer options object.
class OrcOptions
  attr_reader :orc_schema, :orc

  def initialize
    hadoop_conf = Configuration.new
    @orc_schema = OrcSchema.new
    @orc = OrcFile.writerOptions(hadoop_conf)
  end

  # Registers every column from the given schema hash and attaches the
  # resulting struct schema to the writer options.
  def define_table_schema(table_schema)
    raise TypeError, 'table_schema must be a Hash of {column_name: data_type}' unless table_schema.is_a? Hash
    raise ArgumentError, 'table_schema cannot be an empty hash' if table_schema.empty?

    table_schema.each { |column_name, data_type| @orc_schema.add_column(column_name, data_type) }
    @orc.setSchema(@orc_schema.schema)
  end

  # Sets the stripe size in bytes.
  def define_stripe_size(stripe_size)
    @orc.stripeSize(stripe_size)
  end

  # Sets the number of rows between row index entries.
  def define_row_index_stride(row_index_stride)
    @orc.rowIndexStride(row_index_stride)
  end

  # Sets the ORC buffer size in bytes.
  def define_buffer_size(buffer_size)
    @orc.bufferSize(buffer_size)
  end

  # Sets the compression codec; translates an unknown codec name into a
  # Ruby ArgumentError listing the valid choices.
  def define_compression(compression)
    @orc.compress(CompressionKind.valueOf(compression))
  rescue java.lang.IllegalArgumentException
    raise ArgumentError, "#{compression} is not a valid CompressionKind. Must be one of the following: \n#{CompressionKind.constants}"
  end

  # Applies any recognized keys from opts; keys absent or nil are skipped.
  def set_options(opts)
    {
      stripe_size: :define_stripe_size,
      row_index_stride: :define_row_index_stride,
      buffer_size: :define_buffer_size,
      compression: :define_compression
    }.each do |option_key, setter|
      option_value = opts[option_key]
      send(setter, option_value) unless option_value.nil?
    end
  end

end
|
data/lib/orc_schema.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# Incrementally builds an ORC TypeDescription struct schema from Ruby
# column definitions.
class OrcSchema
  attr_reader :schema

  # Maps the gem's supported data-type symbols to the TypeDescription
  # factory method that creates the matching ORC column type.
  TYPE_BUILDERS = {
    integer: :createLong,
    datetime: :createTimestamp,
    time: :createTimestamp,
    date: :createDate,
    decimal: :createDecimal,
    float: :createFloat,
    double: :createDouble,
    string: :createString
  }.freeze

  def initialize
    @schema = TypeDescription.createStruct()
  end

  # Appends one field to the struct schema.
  # Raises ArgumentError when data_type is not a supported type.
  def add_column(column_name, data_type)
    builder = TYPE_BUILDERS[data_type.downcase.to_sym]
    raise ArgumentError, "column data type #{data_type} not defined" if builder.nil?

    @schema.addField(column_name.to_s, TypeDescription.send(builder))
  end
end
|
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: orcfile
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrew Shane
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-01-15 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - ">="
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: '0'
|
19
|
+
name: rspec
|
20
|
+
prerelease: false
|
21
|
+
type: :runtime
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
name: java
|
34
|
+
prerelease: false
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: This gem allows for the creation and reading of Apache Hive Optimized
|
42
|
+
Row Columnar (ORC) files.
|
43
|
+
email:
|
44
|
+
- ashane9@gmail.com
|
45
|
+
executables: []
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- README.rdoc
|
50
|
+
- lib/jars/commons-configuration-1.10.jar
|
51
|
+
- lib/jars/commons-logging-1.2.jar
|
52
|
+
- lib/jars/hadoop-core-1.2.1.jar
|
53
|
+
- lib/jars/hive-exec-2.1.1.jar
|
54
|
+
- lib/jars/slf4j-api-1.7.9.jar
|
55
|
+
- lib/jars/slf4j-simple-1.7.20.jar
|
56
|
+
- lib/orc_file.rb
|
57
|
+
- lib/orc_file_reader.rb
|
58
|
+
- lib/orc_file_writer.rb
|
59
|
+
- lib/orc_options.rb
|
60
|
+
- lib/orc_reader_options.rb
|
61
|
+
- lib/orc_schema.rb
|
62
|
+
homepage:
|
63
|
+
licenses: []
|
64
|
+
metadata:
|
65
|
+
source_code_uri: https://github.com/ashane9/orc_file
|
66
|
+
allowed_push_host: https://rubygems.org
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options: []
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project:
|
83
|
+
rubygems_version: 2.6.14
|
84
|
+
signing_key:
|
85
|
+
specification_version: 4
|
86
|
+
summary: Reader/writer of Hive ORC files
|
87
|
+
test_files: []
|