daru-io 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (44) hide show
  1. checksums.yaml +7 -0
  2. data/.gitignore +11 -0
  3. data/.rspec +2 -0
  4. data/.rspec_formatter.rb +24 -0
  5. data/.rubocop.yml +109 -0
  6. data/.travis.yml +30 -0
  7. data/.yardopts +2 -0
  8. data/CODE_OF_CONDUCT.md +46 -0
  9. data/CONTRIBUTING.md +65 -0
  10. data/Gemfile +20 -0
  11. data/Guardfile +7 -0
  12. data/LICENSE.md +21 -0
  13. data/README.md +654 -0
  14. data/Rakefile +12 -0
  15. data/daru-io.gemspec +39 -0
  16. data/lib/daru/io.rb +3 -0
  17. data/lib/daru/io/base.rb +45 -0
  18. data/lib/daru/io/exporters.rb +1 -0
  19. data/lib/daru/io/exporters/avro.rb +96 -0
  20. data/lib/daru/io/exporters/base.rb +54 -0
  21. data/lib/daru/io/exporters/csv.rb +103 -0
  22. data/lib/daru/io/exporters/excel.rb +148 -0
  23. data/lib/daru/io/exporters/json.rb +570 -0
  24. data/lib/daru/io/exporters/r_data.rb +66 -0
  25. data/lib/daru/io/exporters/rds.rb +79 -0
  26. data/lib/daru/io/exporters/sql.rb +55 -0
  27. data/lib/daru/io/importers.rb +1 -0
  28. data/lib/daru/io/importers/active_record.rb +75 -0
  29. data/lib/daru/io/importers/avro.rb +54 -0
  30. data/lib/daru/io/importers/base.rb +62 -0
  31. data/lib/daru/io/importers/csv.rb +190 -0
  32. data/lib/daru/io/importers/excel.rb +99 -0
  33. data/lib/daru/io/importers/excelx.rb +138 -0
  34. data/lib/daru/io/importers/html.rb +144 -0
  35. data/lib/daru/io/importers/json.rb +152 -0
  36. data/lib/daru/io/importers/mongo.rb +139 -0
  37. data/lib/daru/io/importers/plaintext.rb +97 -0
  38. data/lib/daru/io/importers/r_data.rb +74 -0
  39. data/lib/daru/io/importers/rds.rb +67 -0
  40. data/lib/daru/io/importers/redis.rb +135 -0
  41. data/lib/daru/io/importers/sql.rb +127 -0
  42. data/lib/daru/io/link.rb +80 -0
  43. data/lib/daru/io/version.rb +5 -0
  44. metadata +269 -0
@@ -0,0 +1,12 @@
1
# Rake entry point for daru-io: loads the task libraries, defines their
# tasks, and makes a bare `rake` run the specs followed by RuboCop.
require 'bundler/setup'
require 'rubygems/tasks'
require 'rspec/core/rake_task'
require 'rubocop/rake_task'

# Tasks defined by the rubygems-tasks library.
Gem::Tasks.new

# `rake spec` (:spec is the default task name, spelled out here).
RSpec::Core::RakeTask.new(:spec)

# `rake rubocop` (:rubocop is the default task name, spelled out here).
RuboCop::RakeTask.new(:rubocop)

task default: %w[spec rubocop]
@@ -0,0 +1,39 @@
1
# coding: utf-8

# Gem specification for daru-io.

lib = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'daru/io/version'

# Long description used by the specification below. The guard keeps the
# constant from being reassigned (and Ruby from warning about it) when this
# gemspec is evaluated more than once in the same process.
unless Daru::IO.const_defined?(:DESCRIPTION, false)
  Daru::IO::DESCRIPTION = <<MSG.freeze
Daru-IO is a plugin-gem to Daru gem, which stands for Data Analysis in RUby. Daru-IO extends support for many Import and Export methods of Daru::DataFrame. This gem is intended to help Rubyists who are into Data Analysis or Web Development, by serving as a general purpose conversion library, while also making it incredibly easy to getting started on analyzing data with daru.
MSG
end

Gem::Specification.new do |spec|
  spec.name          = 'daru-io'
  spec.version       = Daru::IO::VERSION
  spec.authors       = ['Athitya Kumar']
  spec.email         = ['athityakumar@gmail.com']
  spec.summary       = 'Daru-IO is a plugin-gem to Daru gem, which stands for Data Analysis in RUby.'
  spec.description   = Daru::IO::DESCRIPTION
  spec.homepage      = 'https://github.com/athityakumar/daru-io'
  spec.license       = 'MIT'
  # Ship every git-tracked file except the test suites.
  spec.files         = `git ls-files -z`.split("\x0").reject { |f| f.match(%r{^(test|spec|features)/}) }
  spec.bindir        = 'bin'
  spec.executables   = spec.files.grep(%r{^bin/}) { |f| File.basename(f) }
  spec.require_paths = ['lib']

  spec.add_runtime_dependency 'daru', '~> 0.1.5'

  spec.add_development_dependency 'bundler', '~> 1.15'
  spec.add_development_dependency 'rake', '~> 10.0'
  spec.add_development_dependency 'redcarpet'
  spec.add_development_dependency 'rspec', '~> 3.0'
  spec.add_development_dependency 'rspec-its'
  spec.add_development_dependency 'rubocop', '>= 0.40.0'
  spec.add_development_dependency 'rubocop-rspec'
  spec.add_development_dependency 'rubygems-tasks'
  spec.add_development_dependency 'simplecov'
  spec.add_development_dependency 'webmock'
  spec.add_development_dependency 'yard'

  # guard-rspec is only added on Ruby >= 2.2.5. Compare as versions: a plain
  # String comparison is lexicographic and mis-orders e.g. '2.10.0' and '2.2.5'.
  if Gem::Version.new(RUBY_VERSION) >= Gem::Version.new('2.2.5')
    spec.add_development_dependency 'guard-rspec'
  end
end
@@ -0,0 +1,3 @@
1
+ require 'daru/io/version'
2
+ require 'daru/io/importers'
3
+ require 'daru/io/exporters'
@@ -0,0 +1,45 @@
1
+ require 'daru'
2
+ require 'daru/io/link'
3
+
4
module Daru
  module IO
    # Base IO Class that contains generic helper methods, to be
    # used by other {Importers::Base} and {Exporters::Base} via inheritance
    class Base
      # Specifies and requires a gem, if the gem is present in the application
      # environment. Else, raises `LoadError` with meaningful message of which
      # dependency to install for which Daru-IO module.
      #
      # @param dependency [String] A dependency to specify with `gem` command
      # @param version [String, Array<String>, nil] A version requirement (or
      #   several) to specify with `gem` command. When `nil`, no requirement
      #   is passed at all.
      # @param requires [String] The gem name to be required, in case it's
      #   different from the dependency name. For example, activerecord
      #   dependency has to be required as `require 'active_record'`
      # @param callback [Class] The Daru-IO module which is being used currently.
      #   Useful for throwing meaningful `LoadError` message.
      #
      # @return [Boolean] The result of `require`
      #
      # @raise [LoadError] When the gem cannot be activated or required. The
      #   underlying error stays reachable through the exception's `cause`.
      #
      # @example Requires with dependency
      #   optional_gem 'avro'
      #   #=> true
      #
      # @example Requires with version and requires
      #   optional_gem 'activerecord', '~> 4.0', requires: 'active_record'
      #   #=> true
      #
      # @example Raises error with meaningful message
      #   df = Daru::DataFrame.from_json('path/to/file.json')
      #   #=> LoadError: Please install the jsonpath gem, or add it to the
      #   #   Gemfile to use the Daru::IO::Importers::JSON module.
      def optional_gem(dependency, version=nil, requires: nil,
        callback: self.class.name)
        # Splatting drops a nil version instead of handing `nil` to Kernel#gem.
        gem(dependency, *version)
        require(requires || dependency)
      rescue LoadError
        requirement = version.nil? ? '' : " #{Array(version).join(', ')} version"
        raise LoadError,
          "Please install the #{dependency} gem#{requirement}, "\
          "or add it to the Gemfile to use the #{callback} module."
      end
    end
  end
end
@@ -0,0 +1 @@
1
# Requires every exporter under daru/io/exporters. The list is sorted so the
# load order is the same on every platform and Ruby version, and the require
# path is built from the file's basename rather than by editing an absolute path.
Dir["#{__dir__}/exporters/*.rb"].sort.each do |file|
  require "daru/io/exporters/#{File.basename(file, '.rb')}"
end
@@ -0,0 +1,96 @@
1
+ require 'daru/io/exporters/base'
2
+
3
module Daru
  module IO
    module Exporters
      # Avro Exporter Class, that extends `to_avro_string` and `write_avro` methods to
      # `Daru::DataFrame` instance variables
      class Avro < Base
        Daru::DataFrame.register_io_module :to_avro_string, self
        Daru::DataFrame.register_io_module :write_avro, self

        # Initializes an Avro Exporter instance.
        #
        # @param dataframe [Daru::DataFrame] A dataframe to export
        # @param schema [Avro::Schema, Hash or String] The schema should contain
        #   details such as `:type`, `:name` and `:fields`. A String is parsed
        #   as a JSON schema definition.
        #
        # @return A `Daru::IO::Exporters::Avro` instance
        #
        # @example Initializing an Avro Exporter
        #   schema = {
        #     "type" => "record",
        #     "name" => "User",
        #     "fields" => [
        #       {"name" => "name", "type" => "string"},
        #       {"name" => "points", "type" => "int"},
        #       {"name"=> "winner", "type"=> "boolean", "default"=> "false"}
        #     ]
        #   }
        #
        #   df = Daru::DataFrame.new(
        #     [
        #       {"name"=> "Dany", "points"=> 100, "winner"=> true},
        #       {"name"=> "Jon", "points"=> 100, "winner"=> true},
        #       {"name"=> "Tyrion", "points"=> 100, "winner"=> true}
        #     ]
        #   )
        #
        #   #=> #<Daru::DataFrame(3x3)>
        #   #        name points winner
        #   #   0    Dany    100   true
        #   #   1     Jon    100   true
        #   #   2  Tyrion    100   true
        #
        #   instance = Daru::IO::Exporters::Avro.new(df, schema)
        def initialize(dataframe, schema=nil)
          optional_gem 'avro'
          require 'json'
          require 'stringio'

          super(dataframe)
          @schema = schema
        end

        # Exports an Avro Exporter instance to a file-writable String.
        #
        # @return [String] A file-writable string
        #
        # @example Getting a file-writable string from Avro Exporter instance
        #   instance.to_s
        #
        #   #=> "Obj\u0001\u0004\u0014avro.codec\bnull\u0016avro.schema\xBC\u0002{\"type\":\"record\"..."
        def to_s
          super
        end

        # Exports an Avro Exporter instance to an avro file.
        #
        # @param path [String] Path of Avro file where the dataframe is to be saved
        #
        # @raise [ArgumentError] When the schema given to the initializer is
        #   not an `Avro::Schema`, a `String` or a `Hash`
        #
        # @example Writing an Avro Exporter instance to an Avro file
        #   instance.write('azor_ahai.avro')
        def write(path)
          schema = process_schema
          buffer = StringIO.new
          writer = ::Avro::DataFile::Writer.new(
            buffer, ::Avro::IO::DatumWriter.new(schema), schema
          )
          @dataframe.each_row { |row| writer << row.to_h }
          writer.close

          # Avro container files are binary: write in binary mode so no
          # newline or encoding translation touches the bytes.
          File.binwrite(path, buffer.string)
        end

        private

        # Normalises the schema given to the initializer into an Avro::Schema.
        def process_schema
          case @schema
          when ::Avro::Schema then @schema
          when String         then ::Avro::Schema.parse(@schema)
          when Hash           then ::Avro::Schema.parse(@schema.to_json)
          else raise ArgumentError, 'Invalid Avro Schema provided.'
          end
        end
      end
    end
  end
end
@@ -0,0 +1,54 @@
1
+ require 'daru/io/base'
2
+
3
module Daru
  module IO
    module Exporters
      # Base Exporter Class that contains generic helper methods, to be
      # used by other Exporters via inheritance
      class Base < Daru::IO::Base
        # Checks whether the first argument given to any `Daru::IO::<Exporter>` module
        # is a `Daru::DataFrame`. Raises an error when it's not a `Daru::DataFrame`.
        #
        # @param dataframe [Daru::DataFrame] A DataFrame to initialize
        #
        # @raise [ArgumentError] When `dataframe` is not a `Daru::DataFrame`
        #
        # @example Stores the dataframe
        #   df = Daru::DataFrame.new([[1,2],[3,4]], order: [:a, :b])
        #   Daru::IO::Exporters::Base.new(df)
        #
        #   #=> #<Daru::IO::Exporters::Base:0x007f899081af08 @dataframe=#<Daru::DataFrame(2x2)>
        #   #       a   b
        #   #   0   1   3
        #   #   1   2   4>
        #
        # @example Raises error when not a DataFrame
        #   Daru::IO::Exporters::Base.new(nil)
        #
        #   #=> ArgumentError: Expected first argument to be a Daru::DataFrame, received NilClass instead.
        def initialize(dataframe)
          unless dataframe.is_a?(Daru::DataFrame)
            raise ArgumentError,
              'Expected first argument to be a Daru::DataFrame, '\
              "received #{dataframe.class} instead."
          end
          @dataframe = dataframe
        end

        # Exports an Exporter instance to a file-writable String, by writing
        # to a temporary file through the subclass's `write` and reading the
        # result back.
        #
        # @return A file-writable `String`
        #
        # @example Getting a file-writable string from an Exporter instance
        #
        #   instance = Daru::IO::Exporters::Format.new(opts)
        #   instance.to_s #! same as df.to_format_string(opts)
        def to_s
          require 'tempfile'

          tempfile = Tempfile.new('tempfile')
          # Release our own handle first; `write` reopens the file by path.
          tempfile.close
          write(tempfile.path)

          File.read(tempfile.path)
        ensure
          # Close and delete the temporary file instead of waiting for GC.
          tempfile.close! if tempfile
        end
      end
    end
  end
end
@@ -0,0 +1,103 @@
1
+ require 'daru/io/exporters/base'
2
+
3
module Daru
  module IO
    module Exporters
      # CSV Exporter Class, that extends `to_csv_string` and `write_csv` methods to
      # `Daru::DataFrame` instance variables
      class CSV < Base
        Daru::DataFrame.register_io_module :to_csv_string, self
        Daru::DataFrame.register_io_module :write_csv, self

        # Matches the textual form of a decimal number (optional sign,
        # mandatory fractional part, optional exponent). Only such values
        # get their decimal point swapped when `convert_comma` is set.
        DECIMAL_NUMBER = /\A[-+]?\d+\.\d+(?:[eE][-+]?\d+)?\z/

        # Initializes a CSV Exporter instance
        #
        # @param dataframe [Daru::DataFrame] A dataframe to export
        # @param converters [Symbol] A type to convert the data in dataframe
        # @param compression [Symbol] Defaults to `:infer`, which decides depending on file format
        #   like `.csv.gz`. For explicitly writing into a `.csv.gz` file, set
        #   `:compression` as `:gzip`.
        # @param headers [Boolean] When set to `false`, the headers aren't written
        #   to the CSV file
        # @param convert_comma [Boolean] When set to `true`, the decimal delimiter
        #   for float values is a comma (,) rather than a dot (.).
        # @param options [Hash] CSV standard library options, to tweak other
        #   default options of CSV gem.
        #
        # @example Initializing a CSV Exporter Instance
        #   df = Daru::DataFrame.new([[1,2],[3,4]], order: [:a, :b])
        #
        #   #=> #<Daru::DataFrame(2x2)>
        #   #      a   b
        #   #  0   1   3
        #   #  1   2   4
        #
        #   csv_instance = Daru::IO::Exporters::CSV.new(df, col_sep: ' ')
        #   csv_gz_instance = Daru::IO::Exporters::CSV.new(df, col_sep: ' ', compression: :gzip)
        def initialize(dataframe, converters: :numeric, compression: :infer,
          headers: nil, convert_comma: nil, **options)
          require 'csv'

          super(dataframe)
          @headers       = headers
          @compression   = compression
          @convert_comma = convert_comma
          @options       = options.merge converters: converters
        end

        # Exports a CSV Exporter instance to a file-writable String.
        #
        # @return [String] A file-writable string
        #
        # @example Getting a file-writable string from CSV Exporter instance
        #   csv_instance.to_s
        #   #=> "a b\n1 3\n2 4\n"
        #
        #   csv_gz_instance.to_s
        #   #=> "\u001F\x8B\b\u0000*D\xA4Y\u0000\u0003KTH\xE22T0\xE62R0\xE1\u0002\u0000\xF2\\\x96y\..."
        def to_s
          super
        end

        # Exports a CSV Exporter instance to a csv / csv.gz file.
        #
        # @param path [String] Path of the csv / csv.gz file where the dataframe is to be saved
        #
        # @example Writing a CSV Exporter instance to a csv / csv.gz file
        #   csv_instance.write('filename.csv')
        #   csv_gz_instance.write('filename.csv.gz')
        def write(path)
          @path    = path
          contents = process_dataframe

          # The options are passed as keywords (`**`): CSV.open and
          # Array#to_csv reject a positional options Hash on Ruby 3.
          # The block forms close the file even when a row fails to serialise.
          if compression?(:gzip, '.csv.gz')
            require 'zlib'
            ::Zlib::GzipWriter.open(@path) do |gz|
              contents.each { |content| gz.write(content.to_csv(**@options)) }
            end
          else
            ::CSV.open(@path, 'w', **@options) do |csv|
              contents.each { |content| csv << content }
            end
          end
        end

        private

        # True when the given algorithm was requested explicitly, or when the
        # target path ends with one of the given extensions.
        def compression?(algorithm, *formats)
          @compression == algorithm || formats.any? { |f| @path.end_with?(f) }
        end

        # Builds the rows to write: the header row (unless disabled) followed
        # by one Array per dataframe row.
        def process_dataframe
          rows = []
          rows << @dataframe.vectors.to_a unless @headers == false
          @dataframe.each_row do |row|
            values = row.to_a
            rows << (@convert_comma ? values.map { |value| comma_decimal(value) } : values)
          end
          rows
        end

        # Stringifies a value and, when it reads as a decimal number, swaps
        # the decimal point for a comma. Other text is left untouched.
        def comma_decimal(value)
          text = value.to_s
          text =~ DECIMAL_NUMBER ? text.tr('.', ',') : text
        end
      end
    end
  end
end
@@ -0,0 +1,148 @@
1
+ require 'daru/io/exporters/base'
2
+
3
module Daru
  module IO
    module Exporters
      # Excel Exporter Class, that extends `to_excel_string` and `write_excel` methods to
      # `Daru::DataFrame` instance variables
      class Excel < Base
        Daru::DataFrame.register_io_module :to_excel_string, self
        Daru::DataFrame.register_io_module :write_excel, self

        # Initializes an Excel Exporter instance.
        #
        # @note For giving formatting options as hashes to the `:data`, `:index` or `header`
        #   keyword argument(s), please have a look at the
        #   {http://www.rubydoc.info/gems/ruby-spreadsheet/Spreadsheet/Font Spreadsheet::Font}
        #   and
        #   {http://www.rubydoc.info/gems/ruby-spreadsheet/Spreadsheet/Format Spreadsheet::Format}
        #   pages.
        #
        # @param dataframe [Daru::DataFrame] A dataframe to export. Supports even dataframes
        #   with multi-index.
        # @param header [Hash or Boolean] Defaults to true. When set to false or nil,
        #   headers are not written. When given a hash of formatting options,
        #   headers are written with the specific formatting. When set to true,
        #   headers are written without any formatting.
        # @param data [Hash or Boolean] Defaults to true. When set to false or nil,
        #   data values are not written. When given a hash of formatting options,
        #   data values are written with the specific formatting. When set to true,
        #   data values are written without any formatting.
        # @param index [Hash or Boolean] Defaults to true. When set to false or nil,
        #   index values are not written. When given a hash of formatting options,
        #   index values are written with the specific formatting. When set to true,
        #   index values are written without any formatting.
        #
        # @example Initializing an Excel Exporter instance
        #   df = Daru::DataFrame.new([[1,2],[3,4]], order: [:a, :b])
        #
        #   #=> #<Daru::DataFrame(2x2)>
        #   #      a   b
        #   #  0   1   3
        #   #  1   2   4
        #
        #   simple_instance = Daru::IO::Exporters::Excel.new(df)
        #   formatted_instance = Daru::IO::Exporters::Excel.new(
        #     df,
        #     header: { color: :red, weight: :bold },
        #     index: false,
        #     data: { color: :blue }
        #   )
        def initialize(dataframe, header: true, data: true, index: true)
          optional_gem 'spreadsheet', '~> 1.1.1'

          super(dataframe)
          @data   = data
          @index  = index
          @header = header
        end

        # Exports an Excel Exporter instance to a file-writable String.
        #
        # @return [String] A file-writable string
        #
        # @example Getting a file-writable string from Excel Exporter instance
        #   simple_instance.to_s #! same as df.to_excel_string
        #
        #   #=> "\xD0\xCF\u0011\u0871\u001A\xE1\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000..."
        #
        #   formatted_instance.to_s
        #
        #   #=> "\xD0\xCF\u0011\u0871\u001A\xE1\u0000\u0000\u0000\u0000\u0000\u0000\u0000\u0000..."
        def to_s
          super
        end

        # Exports an Excel Exporter instance to an xls file.
        #
        # @param path [String] Path of excel file where the dataframe is to be saved
        #
        # @example Writing an Excel Exporter instance to an xls file
        #   simple_instance.write('filename.xls')
        def write(path)
          @book    = Spreadsheet::Workbook.new
          @sheet   = @book.create_worksheet
          # One Spreadsheet::Format per formatting hash, shared by every cell
          # that uses it; reset for each workbook.
          @formats = {}

          process_offsets
          write_headers

          @dataframe.each_row_with_index.with_index do |(row, idx), r|
            write_index(idx, r+@row_offset)
            write_data(row, r+@row_offset)
          end

          @book.write(path)
        end

        private

        # Computes how many sheet rows the header takes and how many sheet
        # columns the index takes.
        def process_offsets
          @row_offset = @header ? 1 : 0
          @col_offset =
            if !@index
              0
            elsif @dataframe.index.is_a?(Daru::MultiIndex)
              @dataframe.index.width
            else
              1
            end
        end

        # Writes the header row: blank cells above the index columns, then
        # the vector names.
        def write_headers
          formatting(
            0...@col_offset + @dataframe.ncols,
            0,
            [' '] * @col_offset + @dataframe.vectors.map(&:to_s),
            @header
          )
        end

        # Writes the index value(s) of one dataframe row into sheet row `row`.
        def write_index(idx, row)
          formatting(
            0...@col_offset,
            row,
            idx,
            @index
          )
        end

        # Writes the data values of one dataframe row into sheet row `sheet_row`.
        def write_data(row, sheet_row)
          formatting(
            @col_offset...@col_offset + @dataframe.ncols,
            sheet_row,
            row,
            @data
          )
        end

        # Appends `values` to sheet row `row` and, when `format` is a Hash of
        # formatting options, applies it to the columns in `col_range`.
        # Does nothing when `format` is false or nil.
        def formatting(col_range, row, values, format)
          return unless format
          @sheet.row(row).concat(cell_values(values))

          return unless format.is_a?(Hash)
          cell_format = shared_format(format)
          col_range.each { |col| @sheet.row(row).set_format(col, cell_format) }
        end

        # Flattens a vector, an index tuple or a single value into the Array
        # of cells to append.
        def cell_values(values)
          case values
          when Daru::Vector then values.to_a
          when Array        then values.map(&:to_s)
          else [values.to_s]
          end
        end

        # Returns the Spreadsheet::Format for a formatting hash, building it
        # only once instead of once per cell.
        def shared_format(format)
          @formats ||= {}
          @formats[format] ||= Spreadsheet::Format.new(format)
        end
      end
    end
  end
end