daru-io 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +2 -0
  4. data/.rspec_formatter.rb +24 -0
  5. data/.rubocop.yml +109 -0
  6. data/.travis.yml +30 -0
  7. data/.yardopts +2 -0
  8. data/CODE_OF_CONDUCT.md +46 -0
  9. data/CONTRIBUTING.md +65 -0
  10. data/Gemfile +20 -0
  11. data/Guardfile +7 -0
  12. data/LICENSE.md +21 -0
  13. data/README.md +654 -0
  14. data/Rakefile +12 -0
  15. data/daru-io.gemspec +39 -0
  16. data/lib/daru/io.rb +3 -0
  17. data/lib/daru/io/base.rb +45 -0
  18. data/lib/daru/io/exporters.rb +1 -0
  19. data/lib/daru/io/exporters/avro.rb +96 -0
  20. data/lib/daru/io/exporters/base.rb +54 -0
  21. data/lib/daru/io/exporters/csv.rb +103 -0
  22. data/lib/daru/io/exporters/excel.rb +148 -0
  23. data/lib/daru/io/exporters/json.rb +570 -0
  24. data/lib/daru/io/exporters/r_data.rb +66 -0
  25. data/lib/daru/io/exporters/rds.rb +79 -0
  26. data/lib/daru/io/exporters/sql.rb +55 -0
  27. data/lib/daru/io/importers.rb +1 -0
  28. data/lib/daru/io/importers/active_record.rb +75 -0
  29. data/lib/daru/io/importers/avro.rb +54 -0
  30. data/lib/daru/io/importers/base.rb +62 -0
  31. data/lib/daru/io/importers/csv.rb +190 -0
  32. data/lib/daru/io/importers/excel.rb +99 -0
  33. data/lib/daru/io/importers/excelx.rb +138 -0
  34. data/lib/daru/io/importers/html.rb +144 -0
  35. data/lib/daru/io/importers/json.rb +152 -0
  36. data/lib/daru/io/importers/mongo.rb +139 -0
  37. data/lib/daru/io/importers/plaintext.rb +97 -0
  38. data/lib/daru/io/importers/r_data.rb +74 -0
  39. data/lib/daru/io/importers/rds.rb +67 -0
  40. data/lib/daru/io/importers/redis.rb +135 -0
  41. data/lib/daru/io/importers/sql.rb +127 -0
  42. data/lib/daru/io/link.rb +80 -0
  43. data/lib/daru/io/version.rb +5 -0
  44. metadata +269 -0
@@ -0,0 +1,12 @@
1
+ require 'bundler/setup'
2
+
3
+ require 'rubygems/tasks'
4
+ Gem::Tasks.new
5
+
6
+ require 'rspec/core/rake_task'
7
+ RSpec::Core::RakeTask.new
8
+
9
+ require 'rubocop/rake_task'
10
+ RuboCop::RakeTask.new
11
+
12
+ task default: %w[spec rubocop]
@@ -0,0 +1,39 @@
1
+ # coding: utf-8
2
+
3
+ lib = File.expand_path('../lib', __FILE__)
4
+ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
5
+ require 'daru/io/version'
6
+
7
+ Daru::IO::DESCRIPTION = <<MSG.freeze
8
+ Daru-IO is a plugin-gem to Daru gem, which stands for Data Analysis in RUby. Daru-IO extends support for many Import and Export methods of Daru::DataFrame. This gem is intended to help Rubyists who are into Data Analysis or Web Development, by serving as a general purpose conversion library, while also making it incredibly easy to getting started on analyzing data with daru.
9
+ MSG
10
+
11
+ Gem::Specification.new do |spec|
12
+ spec.name = 'daru-io'
13
+ spec.version = Daru::IO::VERSION
14
+ spec.authors = ['Athitya Kumar']
15
+ spec.email = ['athityakumar@gmail.com']
16
+ spec.summary = 'Daru-IO is a plugin-gem to Daru gem, which stands for Data Analysis in RUby.'
17
+ spec.description = Daru::IO::DESCRIPTION
18
+ spec.homepage = 'https://github.com/athityakumar/daru-io'
19
+ spec.license = 'MIT'
20
+ spec.files = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
21
+ spec.bindir = 'bin'
22
+ spec.executables = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
23
+ spec.require_paths = ['lib']
24
+
25
+ spec.add_runtime_dependency 'daru', '~> 0.1.5'
26
+
27
+ spec.add_development_dependency 'bundler', '~> 1.15'
28
+ spec.add_development_dependency 'rake', '~> 10.0'
29
+ spec.add_development_dependency 'redcarpet'
30
+ spec.add_development_dependency 'rspec', '~> 3.0'
31
+ spec.add_development_dependency 'rspec-its'
32
+ spec.add_development_dependency 'rubocop', '>= 0.40.0'
33
+ spec.add_development_dependency 'rubocop-rspec'
34
+ spec.add_development_dependency 'rubygems-tasks'
35
+ spec.add_development_dependency 'simplecov'
36
+ spec.add_development_dependency 'webmock'
37
+ spec.add_development_dependency 'yard'
38
+ spec.add_development_dependency 'guard-rspec' if RUBY_VERSION >= '2.2.5'
39
+ end
@@ -0,0 +1,3 @@
1
+ require 'daru/io/version'
2
+ require 'daru/io/importers'
3
+ require 'daru/io/exporters'
@@ -0,0 +1,45 @@
1
+ require 'daru'
2
+ require 'daru/io/link'
3
+
4
+ module Daru
5
+ module IO
6
+ # Base IO Class that contains generic helper methods, to be
7
+ # used by other {Importers::Base} and {Exporters::Base} via inheritance
8
+ class Base
9
+ # Specifies and requires a gem, if the gem is present in the application
10
+ # environment. Else, raises `LoadError` with meaningful message of which
11
+ # dependency to install for which Daru-IO module.
12
+ #
13
+ # @param dependency [String] A dependency to specify with `gem` command
14
+ # @param version [String] A version range to specify with `gem` command
15
+ # @param requires [String] The gem name to be required, in case it's
16
+ # different from the dependency name. For example, activerecord
17
+ # dependency has to be required as `require 'active_record'`
18
+ # @param callback [Class] The Daru-IO module which is being used currently.
19
+ # Useful for throwing meaningful `LoadError` message.
20
+ #
21
+ # @example Requires with dependency
22
+ # optional_gem 'avro'
23
+ # #=> true
24
+ #
25
+ # @example Requires with version and requires
26
+ # optional_gem 'activerecord', '~> 4.0', requires: 'active_record'
27
+ # #=> true
28
+ #
29
+ # @example Raises error with meaningful message
30
+ # df = Daru::DataFrame.from_json('path/to/file.json')
31
+ # #=> LoadError: Please install the jsonpath gem, or add it to the
32
+ # # Gemfile to use the Daru::IO::Importers::JSON module.
33
+ def optional_gem(dependency, version=nil, requires: nil,
34
+ callback: self.class.name)
35
+ gem dependency, version
36
+ require requires || dependency
37
+ rescue LoadError
38
+ version = version.nil? ? '' : " #{version} version"
39
+ raise LoadError,
40
+ "Please install the #{dependency} gem#{version}, "\
41
+ "or add it to the Gemfile to use the #{callback} module."
42
+ end
43
+ end
44
+ end
45
+ end
@@ -0,0 +1 @@
1
+ Dir["#{__dir__}/exporters/*.rb"].each { |file| require "daru/io#{file.gsub(__dir__, '')}" }
@@ -0,0 +1,96 @@
1
+ require 'daru/io/exporters/base'
2
+
3
+ module Daru
4
+ module IO
5
+ module Exporters
6
+ # Avro Exporter Class, that extends `to_avro_string` and `write_avro` methods to
7
+ # `Daru::DataFrame` instance variables
8
+ class Avro < Base
9
+ Daru::DataFrame.register_io_module :to_avro_string, self
10
+ Daru::DataFrame.register_io_module :write_avro, self
11
+
12
+ # Initializes an Avro Exporter instance.
13
+ #
14
+ # @param dataframe [Daru::DataFrame] A dataframe to export
15
+ # @param schema [Avro::Schema or Hash] The schema should contain details such as `:type`,
16
+ # `:name` and `:fields`
17
+ #
18
+ # @return A `Daru::IO::Exporters::Avro` instance
19
+ #
20
+ # @example Initializing an Avro Exporter
21
+ # schema = {
22
+ # "type" => "record",
23
+ # "name" => "User",
24
+ # "fields" => [
25
+ # {"name" => "name", "type" => "string"},
26
+ # {"name" => "points", "type" => "int"},
27
+ # {"name"=> "winner", "type"=> "boolean", "default"=> "false"}
28
+ # ]
29
+ # }
30
+ #
31
+ # df = Daru::DataFrame.new(
32
+ # [
33
+ # {"name"=> "Dany", "points"=> 100, "winner"=> true},
34
+ # {"name"=> "Jon", "points"=> 100, "winner"=> true},
35
+ # {"name"=> "Tyrion", "points"=> 100, "winner"=> true}
36
+ # ]
37
+ # )
38
+ #
39
+ # #=> #<Daru::DataFrame(3x3)>
40
+ # # name points winner
41
+ # # 0 Dany 100 true
42
+ # # 1 Jon 100 true
43
+ # # 2 Tyrion 100 true
44
+ #
45
+ # instance = Daru::IO::Exporters::Avro.new(df, schema)
46
+ def initialize(dataframe, schema=nil)
47
+ optional_gem 'avro'
48
+ require 'json'
49
+
50
+ super(dataframe)
51
+ @schema = schema
52
+ end
53
+
54
+ # Exports an Avro Exporter instance to a file-writable String.
55
+ #
56
+ # @return [String] A file-writable string
57
+ #
58
+ # @example Getting a file-writable string from Avro Exporter instance
59
+ # instance.to_s
60
+ #
61
+ # #=> "Obj\u0001\u0004\u0014avro.codec\bnull\u0016avro.schema\xBC\u0002{\"type\":\"record\"..."
62
+ def to_s
63
+ super
64
+ end
65
+
66
+ # Exports an Avro Exporter instance to an avro file.
67
+ #
68
+ # @param path [String] Path of Avro file where the dataframe is to be saved
69
+ #
70
+ # @example Writing an Avro Exporter instance to an Avro file
71
+ # instance.write('azor_ahai.avro')
72
# Serialises every row of the dataframe with the configured schema and
# stores the resulting Avro container file at +path+.
#
# @param path [String] Path of Avro file where the dataframe is to be saved
#
# @raise [ArgumentError] when the schema given at initialization is not an
#   `Avro::Schema`, a `String` or a `Hash` (raised by `process_schema`)
def write(path)
  @schema_obj = process_schema
  @buffer = StringIO.new
  # The datum writer encodes single rows; the data-file writer wraps it and
  # emits the container framing into the in-memory buffer.
  datum_writer = ::Avro::IO::DatumWriter.new(@schema_obj)
  @writer = ::Avro::DataFile::Writer.new(@buffer, datum_writer, @schema_obj)
  @dataframe.each_row { |row| @writer << row.to_h }
  @writer.close

  # Avro is a binary format, so write it in binary mode: no newline or
  # encoding translation may touch the bytes.
  File.binwrite(path, @buffer.string)
end
82
+
83
+ private
84
+
85
+ def process_schema
86
+ case @schema
87
+ when ::Avro::Schema then @schema
88
+ when String then ::Avro::Schema.parse(@schema)
89
+ when Hash then ::Avro::Schema.parse(@schema.to_json)
90
+ else raise ArgumentError, 'Invalid Avro Schema provided.'
91
+ end
92
+ end
93
+ end
94
+ end
95
+ end
96
+ end
@@ -0,0 +1,54 @@
1
+ require 'daru/io/base'
2
+
3
+ module Daru
4
+ module IO
5
+ module Exporters
6
+ # Base Exporter Class that contains generic helper methods, to be
7
+ # used by other Exporters via inheritance
8
+ class Base < Daru::IO::Base
9
+ # Checks whether the first argument given to any `Daru::IO::<Exporter>` module
10
+ # is a `Daru::DataFrame`. Raises an error when it's not a `Daru::DataFrame`.
11
+ #
12
+ # @param dataframe [Daru::DataFrame] A DataFrame to initialize
13
+ #
14
+ # @example Stores the dataframe
15
+ # df = Daru::DataFrame.new([[1,2],[3,4]], order: [:a, :b])
16
+ # Daru::IO::Exporters::Base.new(df)
17
+ #
18
+ # #=> #<Daru::IO::Exporters::Base:0x007f899081af08 @dataframe=#<Daru::DataFrame(2x2)>
19
+ # # a b
20
+ # # 0 1 3
21
+ # # 1 2 4>
22
+ #
23
+ # @example Raises error when not a DataFrame
24
+ # Daru::IO::Exporters::Base.new(nil)
25
+ #
26
+ # #=> ArgumentError: Expected first argument to be a Daru::DataFrame, received NilClass instead.
27
+ def initialize(dataframe)
28
+ unless dataframe.is_a?(Daru::DataFrame)
29
+ raise ArgumentError,
30
+ 'Expected first argument to be a Daru::DataFrame, '\
31
+ "received #{dataframe.class} instead."
32
+ end
33
+ @dataframe = dataframe
34
+ end
35
+
36
+ # Exports an Exporter instance to a file-writable String.
37
+ #
38
+ # @return A file-writable `String`
39
+ #
40
+ # @example Getting a file-writable string from an Exporter instance
41
+ #
42
+ # instance = Daru::IO::Exporters::Format.new(opts)
43
+ # instance.to_s #! same as df.to_format_string(opts)
44
# Renders the export as a String by letting `write` fill a temporary file
# and reading that file back.
#
# @return [String] the content that `write` stored in the temporary file
def to_s
  # Required here so the method does not rely on another file having
  # loaded Tempfile already.
  require 'tempfile'

  tempfile = Tempfile.new('tempfile')
  begin
    path = tempfile.path
    write(path)

    File.read(path)
  ensure
    # Close and delete the scratch file immediately instead of leaving it
    # around until the Tempfile object is garbage collected.
    tempfile.close!
  end
end
51
+ end
52
+ end
53
+ end
54
+ end
@@ -0,0 +1,103 @@
1
+ require 'daru/io/exporters/base'
2
+
3
+ module Daru
4
+ module IO
5
+ module Exporters
6
+ # CSV Exporter Class, that extends `to_csv_string` and `write_csv` methods to
7
+ # `Daru::DataFrame` instance variables
8
+ class CSV < Base
9
+ Daru::DataFrame.register_io_module :to_csv_string, self
10
+ Daru::DataFrame.register_io_module :write_csv, self
11
+
12
+ # Initializes a CSV Exporter instance
13
+ #
14
+ # @param dataframe [Daru::DataFrame] A dataframe to export
15
+ # @param converters [Symbol] A type to convert the data in dataframe
16
+ # @param compression [Symbol] Defaults to `:infer`, which decides depending on file format
17
+ # like `.csv.gz`. For explicitly writing into a `.csv.gz` file, set
18
+ # `:compression` as `:gzip`.
19
+ # @param headers [Boolean] When set to `false`, the headers aren't written
20
+ # to the CSV file
21
+ # @param convert_comma [Boolean] When set to `true`, the decimal delimiter
22
+ # for float values is a comma (,) rather than a dot (.).
23
+ # @param options [Hash] CSV standard library options, to tweak other
24
+ # default options of CSV gem.
25
+ #
26
+ # @example Initializing a CSV Exporter Instance
27
+ # df = Daru::DataFrame.new([[1,2],[3,4]], order: [:a, :b])
28
+ #
29
+ # #=> #<Daru::DataFrame(2x2)>
30
+ # # a b
31
+ # # 0 1 3
32
+ # # 1 2 4
33
+ #
34
+ # csv_instance = Daru::IO::Exporters::CSV.new(df, col_sep: ' ')
35
+ # csv_gz_instance = Daru::IO::Exporters::CSV.new(df, col_sep: ' ', compression: :gzip)
36
+ def initialize(dataframe, converters: :numeric, compression: :infer,
37
+ headers: nil, convert_comma: nil, **options)
38
+ require 'csv'
39
+
40
+ super(dataframe)
41
+ @headers = headers
42
+ @compression = compression
43
+ @convert_comma = convert_comma
44
+ @options = options.merge converters: converters
45
+ end
46
+
47
+ # Exports a CSV Exporter instance to a file-writable String.
48
+ #
49
+ # @return [String] A file-writable string
50
+ #
51
+ # @example Getting a file-writable string from CSV Exporter instance
52
+ # csv_instance.to_s
53
+ # #=> "a b\n1 3\n2 4\n"
54
+ #
55
+ # csv_gz_instance.to_s
56
+ # #=> "\u001F\x8B\b\u0000*D\xA4Y\u0000\u0003KTH\xE22T0\xE62R0\xE1\u0002\u0000\xF2\\\x96y\..."
57
+ def to_s
58
+ super
59
+ end
60
+
61
+ # Exports a CSV Exporter instance to a csv / csv.gz file.
62
+ #
63
+ # @param path [String] Path of the csv / csv.gz file where the dataframe is to be saved
64
+ #
65
+ # @example Writing a CSV Exporter instance to a csv / csv.gz file
66
+ # csv_instance.write('filename.csv')
67
+ # csv_gz_instance.write('filename.csv.gz')
68
# Writes the dataframe to +path+ as CSV. The output is gzip-compressed when
# `:gzip` compression was requested or when +path+ ends with `.csv.gz`.
#
# @param path [String] Path of the csv / csv.gz file where the dataframe is to be saved
def write(path)
  @path = path
  contents = process_dataframe

  if compression?(:gzip, '.csv.gz')
    require 'zlib'
    # The block form closes the gzip stream on its own, also when a row
    # fails to serialise, so no explicit close is needed.
    ::Zlib::GzipWriter.open(@path) do |gz|
      contents.each { |content| gz.write(content.to_csv(**@options)) }
    end
  else
    # Options are splatted as keywords: Ruby 3 no longer treats a trailing
    # positional Hash as keyword arguments. The block form guarantees the
    # file handle is closed even if writing a row raises.
    ::CSV.open(@path, 'w', **@options) do |csv|
      contents.each { |content| csv << content }
    end
  end
end
84
+
85
+ private
86
+
87
+ def compression?(algorithm, *formats)
88
+ @compression == algorithm || formats.any? { |f| @path.end_with?(f) }
89
+ end
90
+
91
# Builds the rows to be written: the header row (unless headers were
# disabled with `false`) followed by one Array per dataframe row.
#
# With `convert_comma` enabled every cell is converted to a String, and
# cells that look like a float ("3.14", "-0.5", "1.0e+20") get a comma as
# decimal delimiter. Other text containing dots, such as IP addresses or
# version numbers, is left untouched.
#
# @return [Array<Array>] header row (optional) and data rows
def process_dataframe
  rows = []
  rows << @dataframe.vectors.to_a unless @headers == false

  @dataframe.each_row do |row|
    cells = row.to_a
    if @convert_comma
      cells = cells.map do |cell|
        text = cell.to_s
        # Anchored on both ends with an escaped dot, so only complete float
        # literals are rewritten.
        text =~ /\A-?\d+\.\d+(e[+-]?\d+)?\z/i ? text.tr('.', ',') : text
      end
    end
    rows << cells
  end

  rows
end
100
+ end
101
+ end
102
+ end
103
+ end
@@ -0,0 +1,148 @@
1
+ require 'daru/io/exporters/base'
2
+
3
+ module Daru
4
+ module IO
5
+ module Exporters
6
+ # Excel Exporter Class, that extends `to_excel_string` and `write_excel` methods to
7
+ # `Daru::DataFrame` instance variables
8
+ class Excel < Base
9
+ Daru::DataFrame.register_io_module :to_excel_string, self
10
+ Daru::DataFrame.register_io_module :write_excel, self
11
+
12
+ # Initializes an Excel Exporter instance.
13
+ #
14
+ # @note For giving formatting options as hashes to the `:data`, `:index` or `header`
15
+ # keyword argument(s), please have a look at the
16
+ # {http://www.rubydoc.info/gems/ruby-spreadsheet/Spreadsheet/Font Spreadsheet::Font}
17
+ # and
18
+ # {http://www.rubydoc.info/gems/ruby-spreadsheet/Spreadsheet/Format Spreadsheet::Format}
19
+ # pages.
20
+ #
21
+ # @param dataframe [Daru::DataFrame] A dataframe to export. Supports even dataframes
22
+ # with multi-index.
23
+ # @param header [Hash or Boolean] Defaults to true. When set to false or nil,
24
+ # headers are not written. When given a hash of formatting options,
25
+ # headers are written with the specific formatting. When set to true,
26
+ # headers are written without any formatting.
27
+ # @param data [Hash or Boolean] Defaults to true. When set to false or nil,
28
+ # data values are not written. When given a hash of formatting options,
29
+ # data values are written with the specific formatting. When set to true,
30
+ # data values are written without any formatting.
31
+ # @param index [Hash or Boolean] Defaults to true. When set to false or nil,
32
+ # index values are not written. When given a hash of formatting options,
33
+ # index values are written with the specific formatting. When set to true,
34
+ # index values are written without any formatting.
35
+ #
36
+ # @example Initializing an Excel Exporter instance
37
+ # df = Daru::DataFrame.new([[1,2],[3,4]], order: [:a, :b])
38
+ #
39
+ # #=> #<Daru::DataFrame(2x2)>
40
+ # # a b
41
+ # # 0 1 3
42
+ # # 1 2 4
43
+ #
44
+ # simple_instance = Daru::IO::Exporters::Excel.new(df)
45
+ # formatted_instance = Daru::IO::Exporters::Excel.new(
46
+ # df,
47
+ # header: { color: :red, weight: :bold },
48
+ # index: false,
49
+ # data: { color: :blue }
50
+ # )
51
+ def initialize(dataframe, header: true, data: true, index: true)
52
+ optional_gem 'spreadsheet', '~> 1.1.1'
53
+
54
+ super(dataframe)
55
+ @data = data
56
+ @index = index
57
+ @header = header
58
+ end
59
+
60
+ # Exports an Excel Exporter instance to a file-writable String.
61
+ #
62
+ # @return [String] A file-writable string
63
+ #
64
+ # @example Getting a file-writable string from Excel Exporter instance
65
+ # simple_instance.to_s #! same as df.to_excel_string
66
+ #
67
+ # #=> "\xD0\xCF\u0011\u0871\u001A\xE1\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000..."
68
+ #
69
+ # formatted_instance.to_s
70
+ #
71
+ # #=> "\xD0\xCF\u0011\u0871\u001A\xE1\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000..."
72
+ def to_s
73
+ super
74
+ end
75
+
76
+ # Exports an Excel Exporter instance to an xls file.
77
+ #
78
+ # @param path [String] Path of excel file where the dataframe is to be saved
79
+ #
80
+ # @example Writing an Excel Exporter instance to an xls file
81
+ # instance.write('filename.xls')
82
+ def write(path)
83
+ @book = Spreadsheet::Workbook.new
84
+ @sheet = @book.create_worksheet
85
+
86
+ process_offsets
87
+ write_headers
88
+
89
+ @dataframe.each_row_with_index.with_index do |(row, idx), r|
90
+ write_index(idx, r+@row_offset)
91
+ write_data(row, r+@row_offset)
92
+ end
93
+
94
+ @book.write(path)
95
+ end
96
+
97
+ private
98
+
99
+ def process_offsets
100
+ @row_offset = @header ? 1 : 0
101
+ @col_offset = 0 unless @index
102
+ @col_offset ||= @dataframe.index.is_a?(Daru::MultiIndex) ? @dataframe.index.width : 1
103
+ end
104
+
105
+ def write_headers
106
+ formatting(
107
+ 0...@col_offset + @dataframe.ncols,
108
+ 0,
109
+ [' '] * @col_offset + @dataframe.vectors.map(&:to_s),
110
+ @header
111
+ )
112
+ end
113
+
114
+ def write_index(idx, row)
115
+ formatting(
116
+ 0...@col_offset,
117
+ row,
118
+ idx,
119
+ @index
120
+ )
121
+ end
122
+
123
+ def write_data(row, idx)
124
+ formatting(
125
+ @col_offset...@col_offset + @dataframe.ncols,
126
+ idx,
127
+ row,
128
+ @data
129
+ )
130
+ end
131
+
132
+ def formatting(col_range, row, idx, format)
133
+ return unless format
134
+ @sheet.row(row).concat(
135
+ case idx
136
+ when Daru::Vector then idx.to_a
137
+ when Array then idx.map(&:to_s)
138
+ else [idx.to_s]
139
+ end
140
+ )
141
+
142
+ return unless format.is_a?(Hash)
143
+ col_range.each { |col| @sheet.row(row).set_format(col, Spreadsheet::Format.new(format)) }
144
+ end
145
+ end
146
+ end
147
+ end
148
+ end