orcfile 1.0.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/README.rdoc +99 -0
- data/lib/jars/commons-configuration-1.10.jar +0 -0
- data/lib/jars/commons-logging-1.2.jar +0 -0
- data/lib/jars/hadoop-core-1.2.1.jar +0 -0
- data/lib/jars/hive-exec-2.1.1.jar +0 -0
- data/lib/jars/slf4j-api-1.7.9.jar +0 -0
- data/lib/jars/slf4j-simple-1.7.20.jar +0 -0
- data/lib/orc_file.rb +25 -0
- data/lib/orc_file_reader.rb +49 -0
- data/lib/orc_file_writer.rb +66 -0
- data/lib/orc_options.rb +48 -0
- data/lib/orc_reader_options.rb +9 -0
- data/lib/orc_schema.rb +28 -0
- metadata +87 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: ee87da3cc76f9655fd8393da478578d57143210e
|
4
|
+
data.tar.gz: dcac0b20f2f932694b3eab4a1bc32bcf0e7037d1
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 1a50f7b1a22662b433b213bf0e56d0b012e1c6c07c0773caadf6f6095307fee78ebfd2ac6d94f6d4e699cf1e75ef6c992075e68a51732a3fa010a33d770e5bc5
|
7
|
+
data.tar.gz: b9c8e252962f7215828636923ebead9ca8f2d122cadb5d559cac77d2e453ce7dafcf089f9d9ff05ef0cea9036f2663e97abf766295a9ea8c1c95269a38e4d3c9
|
data/README.rdoc
ADDED
@@ -0,0 +1,99 @@
|
|
1
|
+
=ORCFILE
|
2
|
+
Ruby Gem for reading and writing Apache Optimized Row Columnar (ORC) files.
|
3
|
+
This gem can also be paired with the factory_girl gem.
|
4
|
+
|
5
|
+
==Installation
|
6
|
+
Must use jruby.
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'orc_file'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle install
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install orc_file
|
19
|
+
|
20
|
+
==Usage
|
21
|
+
===+OrcFileWriter+
|
22
|
+
To write a file, you will need to initialize the OrcFileWriter class.
|
23
|
+
This object needs a table schema, your dataset, the path to store the file, and an optional configuration hash.
|
24
|
+
|
25
|
+
OrcFileWriter.new(table_schema, data_set, path, options = {})
|
26
|
+
====_table_schema_
|
27
|
+
The table_schema must be a hash containing the column name and datatype as the key-value pair.
|
28
|
+
|
29
|
+
Valid datatypes are:
|
30
|
+
- integer
|
31
|
+
- decimal
|
32
|
+
- float
|
33
|
+
- date
|
34
|
+
- datetime
|
35
|
+
- time
|
36
|
+
- string
|
37
|
+
|
38
|
+
|
39
|
+
table_schema = {:id => :integer, :amount => :decimal, :rate => :float}
|
40
|
+
|
41
|
+
====_data_set_
|
42
|
+
The data_set must contain a hash with the column name and data value as the key-value pair.
|
43
|
+
|
44
|
+
For one row in the dataset:
|
45
|
+
|
46
|
+
data_set = {:id => 1, :amount => 1000.01, :rate => 0.0005}
|
47
|
+
|
48
|
+
For multiple rows in the dataset:
|
49
|
+
|
50
|
+
dataset = [{:id => 1, :amount => 1000.01, :rate => 0.0005},
|
51
|
+
{:id => 2, :amount => 2500.5, :rate => 0.1},
|
52
|
+
{:id => 3, :amount => 10.12, :rate => 10.0134}]
|
53
|
+
|
54
|
+
====_path_
|
55
|
+
The path should be the full file path or relative to your working directory. You must also specify the file name.
|
56
|
+
|
57
|
+
path = '/temp/orc_file.orc'
|
58
|
+
|
59
|
+
====_options_
|
60
|
+
Options is an optional hash parameter containing 4 configurable settings for writing an ORC file.
|
61
|
+
|
62
|
+
`:stripe_size` defines the size of the stripe, defaulted as 67,108,864 bytes <br>
|
63
|
+
`:row_index_stride` defines the number of rows between row index entries, defaulted as 10,000 <br>
|
64
|
+
`:buffer_size` defines the orc buffer size, defaulted as 262,144 bytes <br>
|
65
|
+
`:compression` defines the compression codec (NONE,ZLIB,SNAPPY,LZO), defaulted as ZLIB. <br>
|
66
|
+
|
67
|
+
Define the options parameter as a hash
|
68
|
+
|
69
|
+
options = {:stripe_size => 70000000, :compression => 'SNAPPY'}
|
70
|
+
|
71
|
+
===+write_to_orc+
|
72
|
+
Once you have the OrcFileWriter object initialized you must call write_to_orc to write out the file
|
73
|
+
|
74
|
+
OrcFileWriter.new(table_schema, data_set, path, options).write_to_orc
|
75
|
+
|
76
|
+
===+OrcFileReader+
|
77
|
+
To read a file, you will need to initialize the OrcFileReader class.
|
78
|
+
This object needs a table schema, and the path of the file to be read.
|
79
|
+
|
80
|
+
OrcFileReader.new(table_schema, path)
|
81
|
+
====_table_schema_
|
82
|
+
The table_schema must be a hash containing the column name and datatype as the key-value pair.
|
83
|
+
|
84
|
+
Valid datatypes are:
|
85
|
+
- integer
|
86
|
+
- decimal
|
87
|
+
- float
|
88
|
+
- date
|
89
|
+
- datetime
|
90
|
+
- time
|
91
|
+
- string
|
92
|
+
|
93
|
+
|
94
|
+
table_schema = {:id => :integer, :amount => :decimal, :rate => :float}
|
95
|
+
|
96
|
+
====_path_
|
97
|
+
The path should be the full file path or relative to your working directory. You must also specify the file name.
|
98
|
+
|
99
|
+
path = '/temp/orc_file.orc'
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/orc_file.rb
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
require 'java'
|
2
|
+
require 'bigdecimal'
|
3
|
+
require 'time'
|
4
|
+
require 'jars/slf4j-api-1.7.9.jar'
|
5
|
+
require 'jars/commons-logging-1.2.jar'
|
6
|
+
require 'jars/commons-configuration-1.10.jar'
|
7
|
+
require 'jars/slf4j-simple-1.7.20.jar'
|
8
|
+
require 'jars/hadoop-core-1.2.1.jar'
|
9
|
+
require 'jars/hive-exec-2.1.1.jar'
|
10
|
+
require 'orc_schema'
|
11
|
+
require 'orc_options'
|
12
|
+
require 'orc_reader_options'
|
13
|
+
require 'orc_file_writer'
|
14
|
+
require 'orc_file_reader'
|
15
|
+
|
16
|
+
java_import 'org.slf4j.LoggerFactory'
|
17
|
+
java_import 'org.apache.hadoop.hive.common.type.HiveDecimal'
|
18
|
+
java_import 'org.apache.hadoop.conf.Configuration'
|
19
|
+
java_import 'org.apache.hadoop.fs.Path'
|
20
|
+
java_import 'org.apache.orc.CompressionKind'
|
21
|
+
java_import 'org.apache.orc.TypeDescription'
|
22
|
+
java_import 'org.apache.orc.OrcFile'
|
23
|
+
|
24
|
+
|
25
|
+
|
@@ -0,0 +1,49 @@
|
|
1
|
+
class OrcFileReader
  # Reads rows back out of an Apache ORC file using the Java ORC API.
  #
  # table_schema - Hash of {column_name => data_type} describing the columns
  #                in the order they were written. Valid data types:
  #                :integer, :decimal, :float, :date, :datetime, :time, :string.
  # path         - String path of the ORC file to read.
  attr_reader :reader, :orc_options, :table_schema

  def initialize(table_schema, path = 'orc_file.orc')
    @orc_options = OrcReaderOptions.new
    @table_schema = table_schema
    @reader = OrcFile.createReader(Path.new(path), @orc_options.orc)
  end

  # Converts one row of a VectorizedRowBatch into a Ruby hash keyed by
  # column name. row_index is relative to the current batch, and columns are
  # matched to schema entries positionally.
  def read_row(row_batch, row_index)
    orc_row = {}
    row_batch.cols.each_with_index do |column, index|
      column_name = @table_schema.keys[index]
      case @table_schema[column_name]
      when :integer
        orc_row[column_name] = column.vector[row_index]
      when :decimal
        orc_row[column_name] = column.vector[row_index].get_hive_decimal.to_s.to_d
      when :float
        # Single-precision storage: 0.0005 may come back as 0.0005000000237487257.
        orc_row[column_name] = column.vector[row_index]
      when :datetime
        # Timestamp vectors expose epoch milliseconds; '%Q' parses that.
        orc_row[column_name] = DateTime.strptime(column.time[row_index].to_s, '%Q').to_time.to_datetime
      when :time
        orc_row[column_name] = Time.strptime(column.time[row_index].to_s, '%Q')
      when :date
        # ORC stores dates as days since the Unix epoch (1970-01-01).
        orc_row[column_name] = Date.new(1970, 1, 1) + column.vector[row_index]
      when :string
        orc_row[column_name] = column.toString(row_index)
      end
    end
    orc_row
  end

  # Returns every row in the file as an Array of Hashes.
  #
  # BUGFIX: the original read a single batch but then indexed
  # number_of_rows entries into it, reading garbage for files larger than
  # one batch (1024 rows by default). We now iterate batch by batch,
  # reading only row_batch.size rows from each, and reuse one RecordReader
  # instead of creating a new one per call to @reader.rows.
  def read_from_orc
    rows = []
    row_batch = @reader.get_schema.createRowBatch
    row_reader = @reader.rows
    while row_reader.next_batch(row_batch)
      row_batch.size.times do |row_index|
        rows << read_row(row_batch, row_index)
      end
    end
    rows
  end
end
|
@@ -0,0 +1,66 @@
|
|
1
|
+
require 'orc_options'
|
2
|
+
|
3
|
+
class OrcFileWriter
  # Writes a Ruby data set out as an Apache ORC file via the Java ORC API.
  #
  # table_schema - Hash of {column_name => data_type} (see OrcSchema for
  #                valid data types). Columns are matched positionally.
  # data_set     - one row Hash, or an Array of row Hashes.
  # path         - output file path; '.orc' is appended when missing.
  # options      - optional writer settings, see OrcOptions#set_options.
  attr_reader :writer, :orc_options, :data_set, :table_schema, :output_path

  def initialize(table_schema, data_set, path = 'orc_file.orc', options = {})
    @orc_options = OrcOptions.new
    @orc_options.set_options(options)
    @table_schema = table_schema
    @data_set = data_set
    @orc_options.define_table_schema(table_schema)
    # BUGFIX: build a new string instead of String#concat, which mutated the
    # caller's path argument (and raises on frozen strings). end_with? also
    # replaces include?, which wrongly skipped paths merely containing ".orc".
    path = "#{path}.orc" unless path.end_with?('.orc')
    # BUGFIX: @output_path was advertised by attr_reader but never assigned.
    @output_path = path
    @writer = OrcFile.createWriter(Path.new(path), @orc_options.orc)
  end

  # Builds a single-row VectorizedRowBatch from a {column_name => value}
  # hash. Keys must appear in the same order as @table_schema.
  # Raises ArgumentError for a data type the schema does not define.
  def create_row(row)
    orc_row = @orc_options.orc_schema.schema.createRowBatch()
    orc_row.size = 1
    row.each_with_index do |(key, value), index|
      data_type = @table_schema[key]
      column = orc_row.cols[index]
      if value.nil?
        column.noNulls = false
        case data_type
        when :datetime, :time, :decimal
          # Timestamp/decimal column vectors take an explicit null via set.
          column.set(0, value)
        else
          column.fill_with_nulls
        end
      else
        case data_type
        when :integer
          data_for_column = value.to_java(:long)
        when :decimal
          data_for_column = HiveDecimal.create(value.to_d.to_java)
        when :float
          data_for_column = value.to_java(:double)
        when :datetime, :time
          data_for_column = value.to_time
        when :date
          # Hive needs dates formatted as days since the epoch (1970-01-01).
          data_for_column = (value - Date.new(1970, 1, 1)).to_i
        when :string
          column.initBuffer(value.to_s.bytesize)
          data_for_column = value.to_s.bytes
        else
          raise ArgumentError, "column data type #{data_type} not defined"
        end
        column.fill(data_for_column)
      end
    end
    orc_row
  end

  # Writes every row of @data_set to the ORC file and closes the writer.
  # The writer cannot be reused after this call.
  def write_to_orc
    rows = @data_set.is_a?(Array) ? @data_set : [@data_set]
    rows.each { |row| @writer.addRowBatch(create_row(row)) }
    @writer.close
  end
end
|
data/lib/orc_options.rb
ADDED
@@ -0,0 +1,48 @@
|
|
1
|
+
require 'orc_schema'
|
2
|
+
|
3
|
+
class OrcOptions
  # Wraps the Java OrcFile writer options together with the ORC table schema.
  attr_reader :orc_schema, :orc

  def initialize
    @orc_schema = OrcSchema.new
    @orc = OrcFile.writerOptions(Configuration.new)
  end

  # Registers every column from the {column_name => data_type} hash on the
  # schema, then attaches the completed schema to the writer options.
  # Raises TypeError for a non-Hash argument and ArgumentError for an
  # empty hash.
  def define_table_schema(table_schema)
    raise TypeError, 'table_schema must be a Hash of {column_name: data_type}' unless table_schema.is_a? Hash
    raise ArgumentError, 'table_schema cannot be an empty hash' if table_schema.empty?

    table_schema.each { |column_name, data_type| @orc_schema.add_column(column_name, data_type) }
    @orc.setSchema(@orc_schema.schema)
  end

  # Stripe size in bytes (ORC default: 67,108,864).
  def define_stripe_size(stripe_size)
    @orc.stripeSize(stripe_size)
  end

  # Number of rows between row-index entries (ORC default: 10,000).
  def define_row_index_stride(row_index_stride)
    @orc.rowIndexStride(row_index_stride)
  end

  # ORC buffer size in bytes (ORC default: 262,144).
  def define_buffer_size(buffer_size)
    @orc.bufferSize(buffer_size)
  end

  # Sets the compression codec (NONE, ZLIB, SNAPPY, LZO), translating the
  # Java IllegalArgumentException into a Ruby ArgumentError.
  def define_compression(compression)
    @orc.compress(CompressionKind.valueOf(compression))
  rescue java.lang.IllegalArgumentException
    raise ArgumentError, "#{compression} is not a valid CompressionKind. Must be one of the following: \n#{CompressionKind.constants}"
  end

  # Applies whichever of the supported option keys are present in opts,
  # skipping any that are nil or absent.
  def set_options(opts)
    { stripe_size: :define_stripe_size,
      row_index_stride: :define_row_index_stride,
      buffer_size: :define_buffer_size,
      compression: :define_compression }.each do |key, setter|
      value = opts[key]
      send(setter, value) unless value.nil?
    end
  end
end
|
data/lib/orc_schema.rb
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
class OrcSchema
  # Builds up an ORC struct TypeDescription one column at a time.
  attr_reader :schema

  # Maps the gem's Ruby-facing data types onto TypeDescription factory methods.
  TYPE_FACTORIES = {
    integer: :createLong,
    datetime: :createTimestamp,
    time: :createTimestamp,
    date: :createDate,
    decimal: :createDecimal,
    float: :createFloat,
    double: :createDouble,
    string: :createString
  }.freeze

  def initialize
    @schema = TypeDescription.createStruct()
  end

  # Adds a column named column_name of the given data_type to the struct.
  # data_type may be a Symbol or String in any case. Raises ArgumentError
  # for a data type with no registered factory.
  def add_column(column_name, data_type)
    factory = TYPE_FACTORIES[data_type.downcase.to_sym]
    raise ArgumentError, "column data type #{data_type} not defined" if factory.nil?

    @schema.addField(column_name.to_s, TypeDescription.public_send(factory))
  end
end
|
metadata
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: orcfile
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 1.0.0
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Andrew Shane
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2019-01-15 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
requirement: !ruby/object:Gem::Requirement
|
15
|
+
requirements:
|
16
|
+
- - ">="
|
17
|
+
- !ruby/object:Gem::Version
|
18
|
+
version: '0'
|
19
|
+
name: rspec
|
20
|
+
prerelease: false
|
21
|
+
type: :runtime
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
requirement: !ruby/object:Gem::Requirement
|
29
|
+
requirements:
|
30
|
+
- - ">="
|
31
|
+
- !ruby/object:Gem::Version
|
32
|
+
version: '0'
|
33
|
+
name: java
|
34
|
+
prerelease: false
|
35
|
+
type: :runtime
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - ">="
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: '0'
|
41
|
+
description: This gem allows for the creation and reading of Apache Hive Optimized
|
42
|
+
Row Columnar (ORC) files.
|
43
|
+
email:
|
44
|
+
- ashane9@gmail.com
|
45
|
+
executables: []
|
46
|
+
extensions: []
|
47
|
+
extra_rdoc_files: []
|
48
|
+
files:
|
49
|
+
- README.rdoc
|
50
|
+
- lib/jars/commons-configuration-1.10.jar
|
51
|
+
- lib/jars/commons-logging-1.2.jar
|
52
|
+
- lib/jars/hadoop-core-1.2.1.jar
|
53
|
+
- lib/jars/hive-exec-2.1.1.jar
|
54
|
+
- lib/jars/slf4j-api-1.7.9.jar
|
55
|
+
- lib/jars/slf4j-simple-1.7.20.jar
|
56
|
+
- lib/orc_file.rb
|
57
|
+
- lib/orc_file_reader.rb
|
58
|
+
- lib/orc_file_writer.rb
|
59
|
+
- lib/orc_options.rb
|
60
|
+
- lib/orc_reader_options.rb
|
61
|
+
- lib/orc_schema.rb
|
62
|
+
homepage:
|
63
|
+
licenses: []
|
64
|
+
metadata:
|
65
|
+
source_code_uri: https://github.com/ashane9/orc_file
|
66
|
+
allowed_push_host: https://rubygems.org
|
67
|
+
post_install_message:
|
68
|
+
rdoc_options: []
|
69
|
+
require_paths:
|
70
|
+
- lib
|
71
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - ">="
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '0'
|
76
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
77
|
+
requirements:
|
78
|
+
- - ">="
|
79
|
+
- !ruby/object:Gem::Version
|
80
|
+
version: '0'
|
81
|
+
requirements: []
|
82
|
+
rubyforge_project:
|
83
|
+
rubygems_version: 2.6.14
|
84
|
+
signing_key:
|
85
|
+
specification_version: 4
|
86
|
+
summary: Reader/writer of Hive ORC files
|
87
|
+
test_files: []
|