orcfile 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ee87da3cc76f9655fd8393da478578d57143210e
4
+ data.tar.gz: dcac0b20f2f932694b3eab4a1bc32bcf0e7037d1
5
+ SHA512:
6
+ metadata.gz: 1a50f7b1a22662b433b213bf0e56d0b012e1c6c07c0773caadf6f6095307fee78ebfd2ac6d94f6d4e699cf1e75ef6c992075e68a51732a3fa010a33d770e5bc5
7
+ data.tar.gz: b9c8e252962f7215828636923ebead9ca8f2d122cadb5d559cac77d2e453ce7dafcf089f9d9ff05ef0cea9036f2663e97abf766295a9ea8c1c95269a38e4d3c9
@@ -0,0 +1,99 @@
1
+ =ORCFILE
2
+ Ruby Gem for reading and writing Apache Optimized Row Columnar (ORC) files.
3
+ This gem can also be paired with the factory_girl gem.
4
+
5
+ ==Installation
6
+ Must use jruby.
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'orc_file'
11
+
12
+ And then execute:
13
+
14
+ $ bundle install
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install orc_file
19
+
20
+ ==Usage
21
+ ===+OrcFileWriter+
22
+ To write a file, you will need to initialize the OrcFileWriter class.
23
+ This object needs a table schema, your dataset, the path to store the file, and an optional configuration hash.
24
+
25
+ OrcFileWriter.new(table_schema, data_set, path, options={})
26
+ ====_table_schema_
27
+ The table_schema must be a hash containing the column name and datatype as the key-value pair.
28
+
29
+ Valid datatypes are:
30
+ - integer
31
+ - decimal
32
+ - float
33
+ - date
34
+ - datetime
35
+ - time
36
+ - string
37
+
38
+
39
+ table_schema = {:id => :integer, :amount => :decimal, :rate => :float}
40
+
41
+ ====_data_set_
42
+ The data_set must contain a hash with the column name and data value as the key-value pair.
43
+
44
+ For one row in the dataset:
45
+
46
+ data_set = {:id => 1, :amount => 1000.01, :rate => 0.0005}
47
+
48
+ For multiple rows in the dataset:
49
+
50
+ data_set = [{:id => 1, :amount => 1000.01, :rate => 0.0005},
51
+ {:id => 2, :amount => 2500.5, :rate => 0.1},
52
+ {:id => 3, :amount => 10.12, :rate => 10.0134}]
53
+
54
+ ====_path_
55
+ The path should be the full file path or relative to your working directory. You must also specify the file name.
56
+
57
+ path = '/temp/orc_file.orc'
58
+
59
+ ====_options_
60
+ Options is an optional hash parameter containing 4 configurable settings for writing an ORC file.
61
+
62
+ `:stripe_size` defines the size of the stripe, defaulted as 67,108,864 bytes <br>
63
+ `:row_index_stride` defines the number of rows between row index entries, defaulted as 10,000 <br>
64
+ `:buffer_size` defines the orc buffer size, defaulted as 262,144 bytes <br>
65
+ `:compression` defines the compression codec (NONE,ZLIB,SNAPPY,LZO), defaulted as ZLIB. <br>
66
+
67
+ Define the options parameter as a hash
68
+
69
+ options = {:stripe_size => 70000000, :compression => 'SNAPPY'}
70
+
71
+ ===+write_to_orc+
72
+ Once you have the OrcFileWriter object initialized you must call write_to_orc to write out the file
73
+
74
+ OrcFileWriter.new(table_schema, data_set, path, options).write_to_orc
75
+
76
+ ===+OrcFileReader+
77
+ To read a file, you will need to initialize the OrcFileReader class.
78
+ This object needs a table schema, and the path of the file to be read.
79
+
80
+ OrcFileReader.new(table_schema, path)
81
+ ====_table_schema_
82
+ The table_schema must be a hash containing the column name and datatype as the key-value pair.
83
+
84
+ Valid datatypes are:
85
+ - integer
86
+ - decimal
87
+ - float
88
+ - date
89
+ - datetime
90
+ - time
91
+ - string
92
+
93
+
94
+ table_schema = {:id => :integer, :amount => :decimal, :rate => :float}
95
+
96
+ ====_path_
97
+ The path should be the full file path or relative to your working directory. You must also specify the file name.
98
+
99
+ path = '/temp/orc_file.orc'
@@ -0,0 +1,25 @@
1
+ require 'java'
2
+ require 'bigdecimal'
3
+ require 'time'
4
+ require 'jars/slf4j-api-1.7.9.jar'
5
+ require 'jars/commons-logging-1.2.jar'
6
+ require 'jars/commons-configuration-1.10.jar'
7
+ require 'jars/slf4j-simple-1.7.20.jar'
8
+ require 'jars/hadoop-core-1.2.1.jar'
9
+ require 'jars/hive-exec-2.1.1.jar'
10
+ require 'orc_schema'
11
+ require 'orc_options'
12
+ require 'orc_reader_options'
13
+ require 'orc_file_writer'
14
+ require 'orc_file_reader'
15
+
16
+ java_import 'org.slf4j.LoggerFactory'
17
+ java_import 'org.apache.hadoop.hive.common.type.HiveDecimal'
18
+ java_import 'org.apache.hadoop.conf.Configuration'
19
+ java_import 'org.apache.hadoop.fs.Path'
20
+ java_import 'org.apache.orc.CompressionKind'
21
+ java_import 'org.apache.orc.TypeDescription'
22
+ java_import 'org.apache.orc.OrcFile'
23
+
24
+
25
+
@@ -0,0 +1,49 @@
1
# Reads rows from an Apache ORC file into Ruby hashes, converting each
# column value according to the Ruby data types declared in table_schema.
# Relies on the Java ORC classes (OrcFile, Path, OrcReaderOptions) set up
# in orc_file.rb, so this only works under JRuby.
class OrcFileReader
  attr_reader :reader, :orc_options, :table_schema

  # table_schema - Hash of {column_name => data_type}; column order must
  #                match the order in which the columns were written.
  # path         - file to read, defaults to 'orc_file.orc'.
  def initialize(table_schema, path='orc_file.orc')
    @orc_options = OrcReaderOptions.new
    @table_schema = table_schema
    path = Path.new(path)
    @reader = OrcFile.createReader(path, @orc_options.orc)
  end

  # Converts row row_index of a VectorizedRowBatch into a Hash keyed by
  # the schema's column names, coercing each Java vector value to the
  # Ruby type declared in table_schema. Unknown data types are skipped.
  def read_row(row_batch, row_index)
    orc_row = {}
    row_batch.cols.each_with_index do |column, index|
      column_name = @table_schema.keys[index]
      data_type = @table_schema[column_name]
      case data_type
      when :integer
        orc_row[column_name] = column.vector[row_index]
      when :decimal
        # NOTE(review): String#to_d requires 'bigdecimal/util' — confirm it
        # is loaded (orc_file.rb only requires 'bigdecimal').
        orc_row[column_name] = column.vector[row_index].get_hive_decimal.to_s.to_d
      when :float
        # doubles round-trip with float noise, e.g. 0.0005 reads back as
        # 0.0005000000237487257
        orc_row[column_name] = column.vector[row_index]
      when :datetime
        # stored as epoch milliseconds ('%Q')
        orc_row[column_name] = DateTime.strptime(column.time[row_index].to_s, '%Q').to_time.to_datetime
      when :time
        orc_row[column_name] = Time.strptime(column.time[row_index].to_s, '%Q')
      when :date
        # ORC stores dates as days since the Unix epoch (1970-01-01)
        orc_row[column_name] = Date.new(1970,1,1) + column.vector[row_index]
      when :string
        orc_row[column_name] = column.toString(row_index)
      end
    end
    orc_row
  end

  # Reads every row in the file and returns an Array of row Hashes.
  # Iterates batch by batch so files larger than a single row batch
  # (1024 rows by default) are read completely; the previous
  # implementation called next_batch only once and then indexed
  # number_of_rows entries into the first batch.
  def read_from_orc
    rows = Array.new
    row_batch = @reader.get_schema.createRowBatch()
    record_reader = @reader.rows
    while record_reader.next_batch(row_batch)
      row_batch.size.times do |row_index|
        rows << read_row(row_batch, row_index)
      end
    end
    rows
  end

end
@@ -0,0 +1,66 @@
1
+ require 'orc_options'
2
+
3
# Writes Ruby hashes out to an Apache ORC file using the Java ORC writer
# classes (OrcFile, Path, HiveDecimal) imported in orc_file.rb (JRuby only).
class OrcFileWriter
  attr_reader :writer, :orc_options, :data_set, :table_schema, :output_path

  # table_schema - Hash of {column_name => data_type} (see OrcSchema for
  #                the supported data types).
  # data_set     - a Hash (single row) or an Array of Hashes (many rows).
  # path         - output file path; '.orc' is appended when missing.
  # options      - optional writer settings forwarded to OrcOptions#set_options.
  def initialize(table_schema, data_set, path='orc_file.orc', options={})
    @orc_options = OrcOptions.new
    @orc_options.set_options(options)
    @table_schema = table_schema
    @data_set = data_set
    @orc_options.define_table_schema(table_schema)
    # Build a new string rather than mutating the caller's argument
    # (the previous String#concat modified the object passed in).
    path = "#{path}.orc" unless path.include? '.orc'
    path_object = Path.new(path)
    @writer = OrcFile.createWriter(path_object, @orc_options.orc)
  end

  # Converts one row Hash into a single-row VectorizedRowBatch.
  # NOTE(review): columns are matched by position, so the row's key order
  # is assumed to match table_schema's key order — confirm with callers.
  # Raises ArgumentError for a non-nil value whose data type is unknown.
  def create_row(row)
    orc_row = @orc_options.orc_schema.schema.createRowBatch()
    orc_row.size = 1
    row.each_with_index do |(key, value), index|
      data_type = @table_schema[key]
      if value.nil?
        orc_row.cols[index].noNulls = false
        case data_type
        when :datetime, :time, :decimal
          # these vector types take the null through set() directly
          orc_row.cols[index].set(0, value)
        else
          orc_row.cols[index].fill_with_nulls
        end
      else
        case data_type
        when :integer
          data_for_column = value.to_java(:long)
        when :decimal
          data_for_column = HiveDecimal.create(value.to_d.to_java)
        when :float
          data_for_column = value.to_java(:double)
        when :datetime, :time
          data_for_column = value.to_time
        when :date
          # hive needs date formatted as number of days since epoch (01/01/1970)
          data_for_column = (value - Date.new(1970, 1, 1)).to_i
        when :string
          # size the byte buffer before filling it with the string's bytes
          orc_row.cols[index].initBuffer(value.to_s.bytes.to_a.size)
          data_for_column = value.to_s.bytes
        else
          raise ArgumentError, "column data type #{data_type} not defined"
        end
        orc_row.cols[index].fill(data_for_column)
      end
    end
    orc_row
  end

  # Writes every row of data_set to the ORC file and closes the writer.
  # The writer cannot be reused after this call.
  def write_to_orc
    if @data_set.is_a? Array
      @data_set.each do |row|
        @writer.addRowBatch(create_row(row))
      end
    else
      @writer.addRowBatch(create_row(@data_set))
    end
    @writer.close
  end

end
@@ -0,0 +1,48 @@
1
+ require 'orc_schema'
2
+
3
# Wraps the Java OrcFile writer options, exposing Ruby-friendly setters
# for the table schema, stripe size, row index stride, buffer size and
# compression codec.
class OrcOptions
  attr_reader :orc_schema, :orc

  def initialize
    hadoop_conf = Configuration.new
    @orc_schema = OrcSchema.new
    @orc = OrcFile.writerOptions(hadoop_conf)
  end

  # Builds the ORC struct schema from {column_name => data_type} pairs and
  # attaches it to the writer options. Raises TypeError for a non-Hash and
  # ArgumentError for an empty one.
  def define_table_schema(table_schema)
    raise TypeError, 'table_schema must be a Hash of {column_name: data_type}' unless table_schema.is_a? Hash
    raise ArgumentError, 'table_schema cannot be an empty hash' if table_schema.empty?
    table_schema.each { |column_name, data_type| @orc_schema.add_column(column_name, data_type) }
    @orc.setSchema(@orc_schema.schema)
  end

  # Stripe size in bytes.
  def define_stripe_size(size_in_bytes)
    @orc.stripeSize(size_in_bytes)
  end

  # Number of rows between row index entries.
  def define_row_index_stride(rows_between_entries)
    @orc.rowIndexStride(rows_between_entries)
  end

  # Writer buffer size in bytes.
  def define_buffer_size(size_in_bytes)
    @orc.bufferSize(size_in_bytes)
  end

  # Compression codec name (e.g. 'NONE', 'ZLIB', 'SNAPPY', 'LZO').
  # Translates an invalid name into an ArgumentError listing valid kinds.
  def define_compression(codec_name)
    begin
      @orc.compress(CompressionKind.valueOf(codec_name))
    rescue java.lang.IllegalArgumentException
      raise ArgumentError, "#{codec_name} is not a valid CompressionKind. Must be one of the following: \n#{CompressionKind.constants}"
    end
  end

  # Applies any of the recognized option keys present (and non-nil) in opts.
  def set_options(opts)
    {
      :stripe_size => :define_stripe_size,
      :row_index_stride => :define_row_index_stride,
      :buffer_size => :define_buffer_size,
      :compression => :define_compression
    }.each do |option_key, setter|
      option_value = opts[option_key]
      send(setter, option_value) unless option_value.nil?
    end
  end

end
@@ -0,0 +1,9 @@
1
# Holds the Java-side options needed to open an ORC file for reading:
# a fresh Hadoop Configuration wrapped by OrcFile.readerOptions, plus an
# (empty) OrcSchema placeholder.
class OrcReaderOptions
  attr_reader :orc, :orc_schema

  def initialize
    hadoop_conf = Configuration.new
    @orc_schema = OrcSchema.new
    @orc = OrcFile.readerOptions(hadoop_conf)
  end
end
@@ -0,0 +1,28 @@
1
# Builds an ORC TypeDescription struct schema, mapping Ruby-friendly data
# type symbols onto the corresponding ORC column types.
class OrcSchema
  attr_reader :schema

  # Supported data types and the TypeDescription factory method that
  # creates the matching ORC type.
  TYPE_FACTORY = {
    :integer => :createLong,
    :datetime => :createTimestamp,
    :time => :createTimestamp,
    :date => :createDate,
    :decimal => :createDecimal,
    :float => :createFloat,
    :double => :createDouble,
    :string => :createString
  }.freeze

  def initialize
    @schema = TypeDescription.createStruct()
  end

  # Appends a column to the struct schema. data_type may be given as a
  # Symbol or String in any case; raises ArgumentError when the type is
  # not one of the supported data types.
  def add_column(column_name, data_type)
    factory = TYPE_FACTORY[data_type.downcase.to_sym]
    raise ArgumentError, "column data type #{data_type} not defined" unless factory
    @schema.addField(column_name.to_s, TypeDescription.send(factory))
  end
end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: orcfile
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Shane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-01-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ name: rspec
20
+ prerelease: false
21
+ type: :runtime
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ name: java
34
+ prerelease: false
35
+ type: :runtime
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: This gem allows for the creation and reading of Apache Hive Optimized
42
+ Row Columnar (ORC) files.
43
+ email:
44
+ - ashane9@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - README.rdoc
50
+ - lib/jars/commons-configuration-1.10.jar
51
+ - lib/jars/commons-logging-1.2.jar
52
+ - lib/jars/hadoop-core-1.2.1.jar
53
+ - lib/jars/hive-exec-2.1.1.jar
54
+ - lib/jars/slf4j-api-1.7.9.jar
55
+ - lib/jars/slf4j-simple-1.7.20.jar
56
+ - lib/orc_file.rb
57
+ - lib/orc_file_reader.rb
58
+ - lib/orc_file_writer.rb
59
+ - lib/orc_options.rb
60
+ - lib/orc_reader_options.rb
61
+ - lib/orc_schema.rb
62
+ homepage:
63
+ licenses: []
64
+ metadata:
65
+ source_code_uri: https://github.com/ashane9/orc_file
66
+ allowed_push_host: https://rubygems.org
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 2.6.14
84
+ signing_key:
85
+ specification_version: 4
86
+ summary: Reader/writer of Hive ORC files
87
+ test_files: []