orcfile 1.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA1:
3
+ metadata.gz: ee87da3cc76f9655fd8393da478578d57143210e
4
+ data.tar.gz: dcac0b20f2f932694b3eab4a1bc32bcf0e7037d1
5
+ SHA512:
6
+ metadata.gz: 1a50f7b1a22662b433b213bf0e56d0b012e1c6c07c0773caadf6f6095307fee78ebfd2ac6d94f6d4e699cf1e75ef6c992075e68a51732a3fa010a33d770e5bc5
7
+ data.tar.gz: b9c8e252962f7215828636923ebead9ca8f2d122cadb5d559cac77d2e453ce7dafcf089f9d9ff05ef0cea9036f2663e97abf766295a9ea8c1c95269a38e4d3c9
@@ -0,0 +1,99 @@
1
+ =ORCFILE
2
+ Ruby Gem for reading and writing Apache Optimized Row Columnar (ORC) files.
3
+ This gem can also be paired with the factory_girl gem.
4
+
5
+ ==Installation
6
+ Must use jruby.
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'orc_file'
11
+
12
+ And then execute:
13
+
14
+ $ bundle install
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install orc_file
19
+
20
+ ==Usage
21
+ ===+OrcFileWriter+
22
+ To write a file, you will need to initialize the OrcFileWriter class.
23
+ This object needs a table schema, your dataset, the path to store the file, and an optional configuration hash.
24
+
25
+ OrcFileWriter.new(table_schema, data_set, path, options={})
26
+ ====_table_schema_
27
+ The table_schema must be a hash containing the column name and datatype as the key-value pair.
28
+
29
+ Valid datatypes are:
30
+ - integer
31
+ - decimal
32
+ - float
33
+ - date
34
+ - datetime
35
+ - time
36
+ - string
37
+
38
+
39
+ table_schema = {:id => :integer, :amount => :decimal, :rate => :float}
40
+
41
+ ====_data_set_
42
+ The data_set must contain a hash with the column name and data value as the key-value pair.
43
+
44
+ For one row in the dataset:
45
+
46
+ data_set = {:id => 1, :amount => 1000.01, :rate => 0.0005}
47
+
48
+ For multiple rows in the dataset:
49
+
50
+ dataset = [{:id => 1, :amount => 1000.01, :rate => 0.0005},
51
+ {:id => 2, :amount => 2500.5, :rate => 0.1},
52
+ {:id => 3, :amount => 10.12, :rate => 10.0134}]
53
+
54
+ ====_path_
55
+ The path should be the full file path or relative to your working directory. You must also specify the file name.
56
+
57
+ path = '/temp/orc_file.orc'
58
+
59
+ ====_options_
60
+ Options is an optional hash parameter containing 4 configurable settings for writing an ORC file.
61
+
62
+ `:stripe_size` defines the size of the stripe, defaulted as 67,108,864 bytes <br>
63
+ `:row_index_stride` defines the number of rows between row index entries, defaulted as 10,000 <br>
64
+ `:buffer_size` defines the orc buffer size, defaulted as 262,144 bytes <br>
65
+ `:compression` defines the compression codec (NONE,ZLIB,SNAPPY,LZO), defaulted as ZLIB. <br>
66
+
67
+ Define the options parameter as a hash
68
+
69
+ options = {:stripe_size => 70000000, :compression => 'SNAPPY'}
70
+
71
+ ===+write_to_orc+
72
+ Once you have the OrcFileWriter object initialized you must call write_to_orc to write out the file
73
+
74
+ OrcFileWriter.new(table_schema, data_set, path, options).write_to_orc
75
+
76
+ ===+OrcFileReader+
77
+ To read a file, you will need to initialize the OrcFileReader class.
78
+ This object needs a table schema, and the path of the file to be read.
79
+
80
+ OrcFileReader.new(table_schema, path)
81
+ ====_table_schema_
82
+ The table_schema must be a hash containing the column name and datatype as the key-value pair.
83
+
84
+ Valid datatypes are:
85
+ - integer
86
+ - decimal
87
+ - float
88
+ - date
89
+ - datetime
90
+ - time
91
+ - string
92
+
93
+
94
+ table_schema = {:id => :integer, :amount => :decimal, :rate => :float}
95
+
96
+ ====_path_
97
+ The path should be the full file path or relative to your working directory. You must also specify the file name.
98
+
99
+ path = '/temp/orc_file.orc'
@@ -0,0 +1,25 @@
1
+ require 'java'
2
+ require 'bigdecimal'
3
+ require 'time'
4
+ require 'jars/slf4j-api-1.7.9.jar'
5
+ require 'jars/commons-logging-1.2.jar'
6
+ require 'jars/commons-configuration-1.10.jar'
7
+ require 'jars/slf4j-simple-1.7.20.jar'
8
+ require 'jars/hadoop-core-1.2.1.jar'
9
+ require 'jars/hive-exec-2.1.1.jar'
10
+ require 'orc_schema'
11
+ require 'orc_options'
12
+ require 'orc_reader_options'
13
+ require 'orc_file_writer'
14
+ require 'orc_file_reader'
15
+
16
+ java_import 'org.slf4j.LoggerFactory'
17
+ java_import 'org.apache.hadoop.hive.common.type.HiveDecimal'
18
+ java_import 'org.apache.hadoop.conf.Configuration'
19
+ java_import 'org.apache.hadoop.fs.Path'
20
+ java_import 'org.apache.orc.CompressionKind'
21
+ java_import 'org.apache.orc.TypeDescription'
22
+ java_import 'org.apache.orc.OrcFile'
23
+
24
+
25
+
@@ -0,0 +1,49 @@
1
class OrcFileReader
  attr_reader :reader, :orc_options, :table_schema

  # Opens the ORC file at +path+ for reading.
  #
  # table_schema - Hash of column_name => data_type (e.g. {:id => :integer});
  #                key order must match the column order in the file.
  # path         - full or relative path to the .orc file.
  def initialize(table_schema, path='orc_file.orc')
    @orc_options = OrcReaderOptions.new
    @table_schema = table_schema
    path = Path.new(path)
    @reader = OrcFile.createReader(path, @orc_options.orc)
  end

  # Converts the row at +row_index+ of +row_batch+ into a Ruby Hash keyed by
  # column name, coercing each Java column-vector value to the Ruby type
  # declared in the table schema.
  def read_row(row_batch, row_index)
    orc_row = {}
    row_batch.cols.each_with_index do |column, index|
      column_name = @table_schema.keys[index]
      data_type = @table_schema[column_name]
      case data_type
      when :integer
        orc_row[column_name] = column.vector[row_index]
      when :decimal
        orc_row[column_name] = column.vector[row_index].get_hive_decimal.to_s.to_d
      when :float
        # NOTE(review): single-precision values come back with float noise,
        # e.g. 0.0005 reads as 0.0005000000237487257
        orc_row[column_name] = column.vector[row_index]
      when :datetime
        # timestamps are exposed as epoch milliseconds; '%Q' parses those
        orc_row[column_name] = DateTime.strptime(column.time[row_index].to_s, '%Q').to_time.to_datetime
      when :time
        orc_row[column_name] = Time.strptime(column.time[row_index].to_s, '%Q')
      when :date
        # ORC stores dates as the number of days since the epoch (1970-01-01)
        orc_row[column_name] = Date.new(1970, 1, 1) + column.vector[row_index]
      when :string
        orc_row[column_name] = column.toString(row_index)
      end
    end
    orc_row
  end

  # Reads every row in the file and returns them as an Array of Hashes.
  #
  # Fix: the original filled a single batch (1024 rows by default) and then
  # indexed it number_of_rows times, so any file larger than one batch read
  # past the loaded data. Each batch is now fully drained before asking the
  # record reader for the next one.
  def read_from_orc
    rows = []
    row_batch = @reader.get_schema.createRowBatch()
    record_reader = @reader.rows
    while record_reader.next_batch(row_batch)
      row_batch.size.times do |row_index|
        rows << read_row(row_batch, row_index)
      end
    end
    rows
  end

end
@@ -0,0 +1,66 @@
1
+ require 'orc_options'
2
+
3
class OrcFileWriter
  attr_reader :writer, :orc_options, :data_set, :table_schema, :output_path

  # Prepares an ORC writer.
  #
  # table_schema - Hash of column_name => data_type (e.g. {:id => :integer}).
  # data_set     - a row Hash of column_name => value, or an Array of such Hashes.
  # path         - output path; '.orc' is appended when missing.
  # options      - optional writer settings (:stripe_size, :row_index_stride,
  #                :buffer_size, :compression).
  def initialize(table_schema, data_set, path='orc_file.orc', options={})
    @orc_options = OrcOptions.new
    @orc_options.set_options(options)
    @table_schema = table_schema
    @data_set = data_set
    @orc_options.define_table_schema(table_schema)
    # Fix: build a new string instead of String#concat, which mutated the
    # caller's argument (and raises on frozen string literals), and test the
    # suffix with end_with? instead of include? so a path that merely
    # contains '.orc' (e.g. '/data.orc_files/out') still gets the extension.
    path = "#{path}.orc" unless path.end_with?('.orc')
    path_object = Path.new(path)
    @writer = OrcFile.createWriter(path_object, @orc_options.orc)
  end

  # Builds a single-row VectorizedRowBatch from a Hash of column_name => value,
  # converting each Ruby value to the Java representation ORC expects.
  def create_row(row)
    orc_row = @orc_options.orc_schema.schema.createRowBatch()
    orc_row.size = 1
    row.each_with_index do |(key, value), index|
      data_type = @table_schema[key]
      if value.nil?
        orc_row.cols[index].noNulls = false
        case data_type
        when :datetime, :time, :decimal
          # these vector types take an explicit null via set
          orc_row.cols[index].set(0, value)
        else
          orc_row.cols[index].fill_with_nulls
        end
      else
        case data_type
        when :integer
          data_for_column = value.to_java(:long)
        when :decimal
          data_for_column = HiveDecimal.create(value.to_d.to_java)
        when :float
          data_for_column = value.to_java(:double)
        when :datetime, :time
          data_for_column = value.to_time
        when :date
          # hive needs date formatted as number of days since epoch (1970-01-01)
          data_for_column = (value - Date.new(1970, 1, 1)).to_i
        when :string
          # size the byte buffer to the UTF-8 byte length before filling
          orc_row.cols[index].initBuffer(value.to_s.bytes.to_a.size)
          data_for_column = value.to_s.bytes
        else
          raise ArgumentError, "column data type #{data_type} not defined"
        end
        orc_row.cols[index].fill(data_for_column)
      end
    end
    orc_row
  end

  # Writes the data set out to the ORC file and closes the writer.
  # Accepts either a single row Hash or an Array of row Hashes.
  def write_to_orc
    if @data_set.is_a? Array
      @data_set.each do |row|
        @writer.addRowBatch(create_row(row))
      end
    else
      @writer.addRowBatch(create_row(@data_set))
    end
    @writer.close
  end

end
@@ -0,0 +1,48 @@
1
+ require 'orc_schema'
2
+
3
+ class OrcOptions
4
+ attr_reader :orc_schema, :orc
5
+
6
+ def initialize()
7
+ conf = Configuration.new
8
+ @orc_schema = OrcSchema.new
9
+ @orc = OrcFile.writerOptions(conf)
10
+ end
11
+
12
+ def define_table_schema(table_schema)
13
+ raise TypeError, 'table_schema must be a Hash of {column_name: data_type}' unless table_schema.is_a? Hash
14
+ raise ArgumentError, 'table_schema cannot be an empty hash' if table_schema.empty?
15
+ table_schema.each do |column_name, data_type|
16
+ @orc_schema.add_column(column_name, data_type)
17
+ end
18
+ @orc.setSchema(@orc_schema.schema)
19
+ end
20
+
21
+ def define_stripe_size(stripe_size)
22
+ @orc.stripeSize(stripe_size)
23
+ end
24
+
25
+ def define_row_index_stride(row_index_stride)
26
+ @orc.rowIndexStride(row_index_stride)
27
+ end
28
+
29
+ def define_buffer_size(buffer_size)
30
+ @orc.bufferSize(buffer_size)
31
+ end
32
+
33
+ def define_compression(compression)
34
+ begin
35
+ @orc.compress(CompressionKind.valueOf(compression))
36
+ rescue java.lang.IllegalArgumentException
37
+ raise ArgumentError, "#{compression} is not a valid CompressionKind. Must be one of the following: \n#{CompressionKind.constants}"
38
+ end
39
+ end
40
+
41
+ def set_options(opts)
42
+ define_stripe_size(opts[:stripe_size]) unless opts[:stripe_size].nil?
43
+ define_row_index_stride(opts[:row_index_stride]) unless opts[:row_index_stride].nil?
44
+ define_buffer_size(opts[:buffer_size]) unless opts[:buffer_size].nil?
45
+ define_compression(opts[:compression]) unless opts[:compression].nil?
46
+ end
47
+
48
+ end
@@ -0,0 +1,9 @@
1
class OrcReaderOptions
  attr_reader :orc, :orc_schema

  # Holds the Hadoop Configuration and the ORC reader options consumed by
  # OrcFileReader.
  def initialize
    configuration = Configuration.new
    @orc_schema = OrcSchema.new
    @orc = OrcFile.readerOptions(configuration)
  end
end
@@ -0,0 +1,28 @@
1
class OrcSchema
  attr_reader :schema

  # Accumulates columns into an ORC struct TypeDescription.
  def initialize
    @schema = TypeDescription.createStruct()
  end

  # Adds a field named +column_name+ whose ORC type is derived from the
  # Ruby-level +data_type+ (:integer, :datetime, :time, :date, :decimal,
  # :float, :double, :string). Raises ArgumentError for anything else.
  def add_column(column_name, data_type)
    type =
      case data_type.downcase.to_sym
      when :integer
        TypeDescription.createLong()
      when :datetime, :time
        TypeDescription.createTimestamp()
      when :date
        TypeDescription.createDate()
      when :decimal
        TypeDescription.createDecimal()
      when :float
        TypeDescription.createFloat()
      when :double
        TypeDescription.createDouble()
      when :string
        TypeDescription.createString()
      else
        raise ArgumentError, "column data type #{data_type} not defined"
      end
    @schema.addField(column_name.to_s, type)
  end
end
metadata ADDED
@@ -0,0 +1,87 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: orcfile
3
+ version: !ruby/object:Gem::Version
4
+ version: 1.0.0
5
+ platform: ruby
6
+ authors:
7
+ - Andrew Shane
8
+ autorequire:
9
+ bindir: bin
10
+ cert_chain: []
11
+ date: 2019-01-15 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '0'
19
+ name: rspec
20
+ prerelease: false
21
+ type: :runtime
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
27
+ - !ruby/object:Gem::Dependency
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ name: java
34
+ prerelease: false
35
+ type: :runtime
36
+ version_requirements: !ruby/object:Gem::Requirement
37
+ requirements:
38
+ - - ">="
39
+ - !ruby/object:Gem::Version
40
+ version: '0'
41
+ description: This gem allows for the creation and reading of Apache Hive Optimized
42
+ Row Columnar (ORC) files.
43
+ email:
44
+ - ashane9@gmail.com
45
+ executables: []
46
+ extensions: []
47
+ extra_rdoc_files: []
48
+ files:
49
+ - README.rdoc
50
+ - lib/jars/commons-configuration-1.10.jar
51
+ - lib/jars/commons-logging-1.2.jar
52
+ - lib/jars/hadoop-core-1.2.1.jar
53
+ - lib/jars/hive-exec-2.1.1.jar
54
+ - lib/jars/slf4j-api-1.7.9.jar
55
+ - lib/jars/slf4j-simple-1.7.20.jar
56
+ - lib/orc_file.rb
57
+ - lib/orc_file_reader.rb
58
+ - lib/orc_file_writer.rb
59
+ - lib/orc_options.rb
60
+ - lib/orc_reader_options.rb
61
+ - lib/orc_schema.rb
62
+ homepage:
63
+ licenses: []
64
+ metadata:
65
+ source_code_uri: https://github.com/ashane9/orc_file
66
+ allowed_push_host: https://rubygems.org
67
+ post_install_message:
68
+ rdoc_options: []
69
+ require_paths:
70
+ - lib
71
+ required_ruby_version: !ruby/object:Gem::Requirement
72
+ requirements:
73
+ - - ">="
74
+ - !ruby/object:Gem::Version
75
+ version: '0'
76
+ required_rubygems_version: !ruby/object:Gem::Requirement
77
+ requirements:
78
+ - - ">="
79
+ - !ruby/object:Gem::Version
80
+ version: '0'
81
+ requirements: []
82
+ rubyforge_project:
83
+ rubygems_version: 2.6.14
84
+ signing_key:
85
+ specification_version: 4
86
+ summary: Reader/writer of Hive ORC files
87
+ test_files: []