daru_lite 0.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
  3. data/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  4. data/.github/workflows/ci.yml +20 -0
  5. data/.rubocop_todo.yml +35 -33
  6. data/README.md +19 -115
  7. data/daru_lite.gemspec +1 -0
  8. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  9. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  10. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  11. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  12. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  13. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  14. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  15. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  16. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  17. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  18. data/lib/daru_lite/data_frame/missable.rb +75 -0
  19. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  20. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  21. data/lib/daru_lite/data_frame/setable.rb +109 -0
  22. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  23. data/lib/daru_lite/dataframe.rb +142 -2355
  24. data/lib/daru_lite/index/index.rb +13 -0
  25. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  26. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  27. data/lib/daru_lite/vector/calculatable.rb +78 -0
  28. data/lib/daru_lite/vector/convertible.rb +77 -0
  29. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  30. data/lib/daru_lite/vector/fetchable.rb +175 -0
  31. data/lib/daru_lite/vector/filterable.rb +128 -0
  32. data/lib/daru_lite/vector/indexable.rb +77 -0
  33. data/lib/daru_lite/vector/iterable.rb +95 -0
  34. data/lib/daru_lite/vector/joinable.rb +17 -0
  35. data/lib/daru_lite/vector/missable.rb +124 -0
  36. data/lib/daru_lite/vector/queryable.rb +45 -0
  37. data/lib/daru_lite/vector/setable.rb +47 -0
  38. data/lib/daru_lite/vector/sortable.rb +113 -0
  39. data/lib/daru_lite/vector.rb +36 -932
  40. data/lib/daru_lite/version.rb +1 -1
  41. data/spec/data_frame/aggregatable_example.rb +65 -0
  42. data/spec/data_frame/buildable_example.rb +109 -0
  43. data/spec/data_frame/calculatable_example.rb +135 -0
  44. data/spec/data_frame/convertible_example.rb +180 -0
  45. data/spec/data_frame/duplicatable_example.rb +111 -0
  46. data/spec/data_frame/fetchable_example.rb +476 -0
  47. data/spec/data_frame/filterable_example.rb +250 -0
  48. data/spec/data_frame/indexable_example.rb +221 -0
  49. data/spec/data_frame/iterable_example.rb +465 -0
  50. data/spec/data_frame/joinable_example.rb +106 -0
  51. data/spec/data_frame/missable_example.rb +47 -0
  52. data/spec/data_frame/pivotable_example.rb +297 -0
  53. data/spec/data_frame/queryable_example.rb +92 -0
  54. data/spec/data_frame/setable_example.rb +482 -0
  55. data/spec/data_frame/sortable_example.rb +350 -0
  56. data/spec/dataframe_spec.rb +181 -3243
  57. data/spec/index/index_spec.rb +8 -0
  58. data/spec/vector/aggregatable_example.rb +27 -0
  59. data/spec/vector/calculatable_example.rb +82 -0
  60. data/spec/vector/convertible_example.rb +126 -0
  61. data/spec/vector/duplicatable_example.rb +48 -0
  62. data/spec/vector/fetchable_example.rb +463 -0
  63. data/spec/vector/filterable_example.rb +165 -0
  64. data/spec/vector/indexable_example.rb +201 -0
  65. data/spec/vector/iterable_example.rb +111 -0
  66. data/spec/vector/joinable_example.rb +25 -0
  67. data/spec/vector/missable_example.rb +88 -0
  68. data/spec/vector/queryable_example.rb +91 -0
  69. data/spec/vector/setable_example.rb +300 -0
  70. data/spec/vector/sortable_example.rb +242 -0
  71. data/spec/vector_spec.rb +111 -1805
  72. metadata +102 -3
  73. data/.github/ISSUE_TEMPLATE.md +0 -18
@@ -0,0 +1,144 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Filterable
4
+ # Return unique rows by vector specified or all vectors
5
+ #
6
+ # @param vtrs [String, Symbol] vector name(s) that should be considered
7
+ #
8
+ # @example
9
+ #
10
+ # => #<DaruLite::DataFrame(6x2)>
11
+ # a b
12
+ # 0 1 a
13
+ # 1 2 b
14
+ # 2 3 c
15
+ # 3 4 d
16
+ # 2 3 c
17
+ # 3 4 f
18
+ #
19
+ # 2.3.3 :> df.uniq
20
+ # => #<DaruLite::DataFrame(5x2)>
21
+ # a b
22
+ # 0 1 a
23
+ # 1 2 b
24
+ # 2 3 c
25
+ # 3 4 d
26
+ # 3 4 f
27
+ #
28
+ # 2.3.3 :> df.uniq(:a)
29
+ # => #<DaruLite::DataFrame(4x2)>
30
+ # a b
31
+ # 0 1 a
32
+ # 1 2 b
33
+ # 2 3 c
34
+ # 3 4 d
35
+ #
36
def uniq(*vtrs)
  # With no arguments every vector of the DataFrame takes part in the comparison.
  key_vectors = vtrs.empty? ? vectors.to_a : Array(vtrs)
  # The first entry of every group identifies the row that is kept.
  first_of_each_group = group_by(key_vectors).groups.values.map { |members| members[0] }
  # NOTE(review): when only one group exists a single label is splatted into
  # row[]; confirm that this still yields a DataFrame rather than a Vector.
  row[*first_of_each_group.sort]
end
42
+
43
+ # Retain vectors or rows if the block returns a truthy value.
44
+ #
45
+ # == Description
46
+ #
47
+ # For filtering out certain rows/vectors based on their values,
48
+ # use the #filter method. By default it iterates over vectors and
49
+ # keeps those vectors for which the block returns true. It accepts
50
+ # an optional axis argument which lets you specify whether you want
51
+ # to iterate over vectors or rows.
52
+ #
53
+ # == Arguments
54
+ #
55
+ # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
56
+ # Default to :vector.
57
+ #
58
+ # == Usage
59
+ #
60
+ # # Filter vectors
61
+ #
62
+ # df.filter do |vector|
63
+ # vector.type == :numeric and vector.median < 50
64
+ # end
65
+ #
66
+ # # Filter rows
67
+ #
68
+ # df.filter(:row) do |row|
69
+ # row[:a] + row[:d] < 100
70
+ # end
71
def filter(axis = :vector, &block)
  # Hands the axis, the :filter operation name and the caller's block to
  # dispatch_to_axis_pl, which picks the axis-specific implementation.
  dispatch_to_axis_pl(axis, :filter, &block)
end
74
+
75
+ # Returns a dataframe in which rows with any of the mentioned values
76
+ # are ignored.
77
+ # @param [Array] values to reject to form the new dataframe
78
+ # @return [DaruLite::DataFrame] Data Frame with only rows which don't
79
+ # contain the mentioned values
80
+ # @example
81
+ # df = DaruLite::DataFrame.new({
82
+ # a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
83
+ # b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
84
+ # c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
85
+ # }, index: 11..18)
86
+ # df.reject_values nil, Float::NAN
87
+ # # => #<DaruLite::DataFrame(2x3)>
88
+ # # a b c
89
+ # # 11 1 a a
90
+ # # 18 7 8 7
91
def reject_values(*values)
  all_positions = size.times.to_a
  # Positions at which at least one vector holds one of the rejected values.
  rejected = @data.flat_map { |vec| vec.positions(*values) }
  kept = all_positions - rejected

  # A lone surviving position is passed as a one-element range, because
  # #row_at with a single position wouldn't return a DataFrame.
  return row_at(kept.first..kept.first) if kept.size == 1

  row_at(*kept)
end
102
+
103
# Deletes, in place, every row for which the block returns a falsy value.
#
# @yield [row] the result of #row_at for each position of the index
def keep_row_if
  doomed = (0...@index.size).reject { |pos| yield(row_at(pos)) }
  # Highest position first, presumably so that a removal never shifts a
  # position that is still waiting to be deleted.
  doomed.reverse_each { |pos| delete_at_position(pos) }
end
108
+
109
# Deletes, in place, every vector for which the block returns a falsy value.
#
# @yield [vector_data, name] the entry of @data belonging to the vector,
#   followed by the vector's name
def keep_vector_if
  @vectors.each do |name|
    keep = yield(@data[@vectors[name]], name)
    delete_vector(name) unless keep
  end
end
114
+
115
# Builds a new vector from the values of one field, taken only from the rows
# for which the block returns a truthy value.
#
# @param vec name of the field read from each retained row
# @return [DaruLite::Vector]
def filter_vector(vec, &block)
  matching_rows = each_row.select(&block)
  picked = matching_rows.map { |row| row[vec] }
  DaruLite::Vector.new(picked)
end
119
+
120
# Visits every row and collects, in a new DataFrame, those for which the
# block returns true. Without a block an enumerator for this method is
# returned instead.
def filter_rows
  if block_given?
    # One entry per index label, later handed to #where as the selection mask.
    mask = @index.map { |label| yield access_row(label) }
    where(mask)
  else
    to_enum(:filter_rows)
  end
end
129
+
130
# Visits every vector and collects, in a new DataFrame, those for which the
# block returns true. Without a block an enumerator for this method is
# returned instead.
def filter_vectors(&block)
  return to_enum(:filter_vectors) if block.nil?

  # Work on a duplicate so the receiver keeps all of its vectors.
  copy = dup
  copy.keep_vector_if(&block)
  copy
end
137
+
138
# Query this DataFrame with a DaruLite::Core::Query::BoolArray object.
# The work is delegated to DaruLite::Core::Query.df_where.
def where(bool_array)
  DaruLite::Core::Query.df_where(self, bool_array)
end
142
+ end
143
+ end
144
+ end
@@ -0,0 +1,179 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module IOAble
4
+ module ClassMethods
5
+ # Load data from a CSV file. Specify an optional block to grab the CSV
6
+ # object and pre-condition it (for example use the `convert` or
7
+ # `header_convert` methods).
8
+ #
9
+ # == Arguments
10
+ #
11
+ # * path - Local path / Remote URL of the file to load specified as a String.
12
+ #
13
+ # == Options
14
+ #
15
+ # Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
16
+ # and uses those to eventually construct the resulting DataFrame.
17
+ #
18
+ # == Verbose Description
19
+ #
20
+ # You can specify all the options to the `.from_csv` function that you
21
+ # do to the Ruby `CSV.read()` function, since this is what is used internally.
22
+ #
23
+ # For example, if the columns in your CSV file are separated by something
24
+ # other than commas, you can use the `:col_sep` option. If you want to
25
+ # convert numeric values to numbers and not keep them as strings, you can
26
+ # use the `:converters` option and set it to `:numeric`.
27
+ #
28
+ # The `.from_csv` function uses the following defaults for reading CSV files
29
+ # (that are passed into the `CSV.read()` function):
30
+ #
31
+ # {
32
+ # :col_sep => ',',
33
+ # :converters => :numeric
34
+ # }
35
def from_csv(path, opts = {}, &block)
  # Pure delegation: path, options and the optional block go to DaruLite::IO unchanged.
  DaruLite::IO.from_csv(path, opts, &block)
end
38
+
39
+ # Read data from an Excel file into a DataFrame.
40
+ #
41
+ # == Arguments
42
+ #
43
+ # * path - Path of the file to be read.
44
+ #
45
+ # == Options
46
+ #
47
+ # * :worksheet_id - ID of the worksheet that is to be read.
48
def from_excel(path, opts = {}, &block)
  # Pure delegation: path, options and the optional block go to DaruLite::IO unchanged.
  DaruLite::IO.from_excel(path, opts, &block)
end
51
+
52
+ # Execute a database query and return the result as a DataFrame
53
+ #
54
+ # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQLite3 database.
55
+ # @param query [String] The query to be executed
56
+ #
57
+ # @return A dataframe containing the data resulting from the query
58
+ #
59
+ # USE:
60
+ #
61
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
62
+ # DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
63
+ #
64
+ # #Alternatively
65
+ #
66
+ # require 'dbi'
67
+ # DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
68
def from_sql(dbh, query)
  # Pure delegation to DaruLite::IO; both arguments are passed through as given.
  DaruLite::IO.from_sql(dbh, query)
end
71
+
72
+ # Read a dataframe from AR::Relation
73
+ #
74
+ # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
75
+ # @param fields [Array] Field names to be loaded (optional)
76
+ #
77
+ # @return A dataframe containing the data loaded from the relation
78
+ #
79
+ # USE:
80
+ #
81
+ # # When Post model is defined as:
82
+ # class Post < ActiveRecord::Base
83
+ # scope :active, -> { where.not(published_at: nil) }
84
+ # end
85
+ #
86
+ # # You can load active posts into a dataframe by:
87
+ # DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
88
def from_activerecord(relation, *fields)
  # Pure delegation to DaruLite::IO; the field list is re-splatted as given.
  DaruLite::IO.from_activerecord(relation, *fields)
end
91
+
92
+ # Read the database from a plaintext file. For this method to work,
93
+ # the data should be present in a plain text file in columns. See
94
+ # spec/fixtures/bank2.dat for an example.
95
+ #
96
+ # == Arguments
97
+ #
98
+ # * path - Path of the file to be read.
99
+ # * fields - Vector names of the resulting database.
100
+ #
101
+ # == Usage
102
+ #
103
+ # df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
104
def from_plaintext(path, fields)
  # Pure delegation to DaruLite::IO; both arguments are passed through as given.
  DaruLite::IO.from_plaintext(path, fields)
end
107
+
108
# Rebuilds a DataFrame from a marshalled hash with the keys :data, :index,
# :order and :name (the layout written by #_dump).
#
# NOTE(review): this calls Marshal.load on +data+. Marshal can instantiate
# arbitrary objects, so only trusted input should ever reach this method.
#
# @param data [String] the marshalled hash
# @return [DaruLite::DataFrame]
def _load(data)
  attrs = Marshal.load(data)
  DaruLite::DataFrame.new(
    attrs[:data],
    index: attrs[:index],
    order: attrs[:order],
    name: attrs[:name]
  )
end
117
+ end
118
+
119
# Inclusion hook: extends the including class with ClassMethods so that the
# methods defined there become class-level methods of that class.
def self.included(base)
  base.extend(ClassMethods)
end
122
+
123
+ # Write this DataFrame to a CSV file.
124
+ #
125
+ # == Arguments
126
+ #
127
+ # * filename - Path of CSV file where the DataFrame is to be saved.
128
+ #
129
+ # == Options
130
+ #
131
+ # * convert_comma - If set to *true*, will convert any commas in any
132
+ # of the data to full stops ('.').
133
+ # All the options accepted by CSV.read() can also be passed into this
134
+ # function.
135
def write_csv(filename, opts = {})
  # Pure delegation: this DataFrame, the target path and the options go to DaruLite::IO.
  DaruLite::IO.dataframe_write_csv(self, filename, opts)
end
138
+
139
+ # Write this dataframe to an Excel Spreadsheet
140
+ #
141
+ # == Arguments
142
+ #
143
+ # * filename - The path of the file where the DataFrame should be written.
144
def write_excel(filename, opts = {})
  # Pure delegation: this DataFrame, the target path and the options go to DaruLite::IO.
  DaruLite::IO.dataframe_write_excel(self, filename, opts)
end
147
+
148
+ # Insert each row of this DataFrame into the selected table
149
+ #
150
+ # == Arguments
151
+ #
152
+ # * dbh - DBI database connection object.
153
+ # * query - Query string.
154
+ #
155
+ # == Usage
156
+ #
157
+ # ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
158
+ # dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
159
+ # ds.write_sql(dbh,"test")
160
def write_sql(dbh, table)
  # Pure delegation: this DataFrame, the connection and the table go to DaruLite::IO.
  DaruLite::IO.dataframe_write_sql(self, dbh, table)
end
163
+
164
# Save this DataFrame to a file using marshalling; the work is delegated to
# DaruLite::IO.save.
#
# @param filename path of the file to write
def save(filename)
  DaruLite::IO.save(self, filename)
end
168
+
169
# Serializes this DataFrame as a marshalled hash holding its data, its index
# labels, its vector order and its name.
#
# @param _depth ignored
# @return [String] the marshalled hash
def _dump(_depth)
  snapshot = {
    data: @data,
    index: @index.to_a,
    order: @vectors.to_a,
    name: @name
  }
  Marshal.dump(snapshot)
end
177
+ end
178
+ end
179
+ end
@@ -0,0 +1,168 @@
1
+ module DaruLite
2
+ class DataFrame
3
+ module Indexable
4
# Strategy used by #set_index when a single column becomes a DaruLite::Index.
module SetSingleIndexStrategy
  class << self
    # Number of distinct values found in column +col+ of +df+.
    def uniq_size(df, col)
      df[col].uniq.size
    end

    # A DaruLite::Index built from the values of column +col+.
    def new_index(df, col)
      labels = df[col].to_a
      DaruLite::Index.new(labels)
    end

    # Removes column +col+ from +df+.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
17
+
18
# Strategy used by #set_index when a column becomes a
# DaruLite::CategoricalIndex. It defines no uniq_size.
module SetCategoricalIndexStrategy
  class << self
    # A DaruLite::CategoricalIndex built from the values of column +col+.
    def new_index(df, col)
      labels = df[col].to_a
      DaruLite::CategoricalIndex.new(labels)
    end

    # Removes column +col+ from +df+.
    def delete_vector(df, col)
      df.delete_vector(col)
    end
  end
end
27
+
28
# Strategy used by #set_index when several columns together become a
# DaruLite::MultiIndex.
module SetMultiIndexStrategy
  class << self
    # Number of distinct entries in the sub-frame made of +cols+.
    def uniq_size(df, cols)
      df[*cols].uniq.size
    end

    # A DaruLite::MultiIndex built from the arrays of the selected columns
    # and named after those columns.
    def new_index(df, cols)
      arrays = df[*cols].map_vectors(&:to_a)
      multi_index = DaruLite::MultiIndex.from_arrays(arrays)
      multi_index.name = cols
      multi_index
    end

    # Removes all of +cols+ from +df+.
    def delete_vector(df, cols)
      df.delete_vectors(*cols)
    end
  end
end
43
+
44
# Turn a column (or several columns) into the index of this DataFrame.
# The DataFrame is modified in place.
#
# @param new_index_col the column to use; a value responding to #to_a is
#   converted with #to_a and handled by SetMultiIndexStrategy
# @param keep [Boolean] when falsy, the source column(s) are deleted afterwards
# @param categorical [Boolean] when truthy, SetCategoricalIndexStrategy is
#   used and the uniqueness check is skipped
# @raise [ArgumentError] if the check runs and the number of unique values
#   differs from @size
# @return [self]
def set_index(new_index_col, keep: false, categorical: false)
  strategy =
    if categorical
      SetCategoricalIndexStrategy
    elsif new_index_col.respond_to?(:to_a)
      SetMultiIndexStrategy
    else
      SetSingleIndexStrategy
    end
  new_index_col = new_index_col.to_a if strategy.equal?(SetMultiIndexStrategy)

  # SetCategoricalIndexStrategy defines no uniq_size, so the check only runs
  # for the other two strategies.
  if !categorical && @size != strategy.uniq_size(self, new_index_col)
    raise ArgumentError, 'All elements in new index must be unique.'
  end

  self.index = strategy.new_index(self, new_index_col)
  strategy.delete_vector(self, new_index_col) unless keep
  self
end
64
+
65
+ # Change the index of the DataFrame and preserve the labels of the previous
66
+ # indexing. New index can be DaruLite::Index or any of its subclasses.
67
+ #
68
+ # @param [DaruLite::Index] new_index The new Index for reindexing the DataFrame.
69
+ # @example Reindexing DataFrame
70
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
71
+ # index: ['a','b','c','d'])
72
+ # #=>
73
+ # ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
74
+ # # a b
75
+ # # a 1 11
76
+ # # b 2 22
77
+ # # c 3 33
78
+ # # d 4 44
79
+ # df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
80
+ # #=>
81
+ # ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
82
+ # # a b
83
+ # # b 2 22
84
+ # # 0 nil nil
85
+ # # a 1 11
86
+ # # g nil nil
87
def reindex(new_index)
  unless new_index.is_a?(DaruLite::Index)
    raise ArgumentError, "Must pass the new index of type Index or its subclasses, not #{new_index.class}"
  end

  # Start from an empty frame with the same vectors and name, then fill it
  # one row per label of the new index.
  reindexed = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
  new_index.each do |label|
    reindexed.row[label] =
      if @index.include?(label)
        row[label]
      else
        # Labels unknown to the current index get a row of nils.
        Array.new(ncols)
      end
  end
  reindexed
end
98
+
99
# Moves the current index into the DataFrame as its leading vector(s) and
# replaces it with the index of +index.to_df+. Modifies the DataFrame in place.
#
# @return [self]
def reset_index
  index_df = index.to_df
  label = index.name
  # A multi-level index reports an Array of names, a plain one a single name.
  names = label.instance_of?(Array) ? label : [label]
  reordered = names + vectors.to_a

  self.index = index_df.index
  names.each { |name| self[name] = index_df[name] }
  self.order = reordered
  self
end
111
+
112
+ # Reassign index with a new index of type DaruLite::Index or any of its subclasses.
113
+ #
114
+ # @param [DaruLite::Index] idx New index object on which the rows of the dataframe
115
+ # are to be indexed.
116
+ # @example Reassigning index of a DataFrame
117
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
118
+ # df.index.to_a #=> [0,1,2,3]
119
+ #
120
+ # df.index = DaruLite::Index.new(['a','b','c','d'])
121
+ # df.index.to_a #=> ['a','b','c','d']
122
+ # df.row['a'].to_a #=> [1,11]
123
def index=(idx)
  @index = Index.coerce(idx)
  # Give every vector held in @data the same coerced index object.
  @data.each do |vector|
    vector.index = @index
  end

  self
end
129
+
130
# Builds a new DataFrame whose vectors follow +new_vectors+. Vectors already
# present in this DataFrame are carried over; unknown ones are filled with nil.
#
# @param new_vectors [DaruLite::Index] the vector names of the result
# @raise [ArgumentError] if +new_vectors+ is not a DaruLite::Index
# @return [DaruLite::DataFrame]
def reindex_vectors(new_vectors)
  unless new_vectors.is_a?(DaruLite::Index)
    raise ArgumentError, "Must pass the new index of type Index or its subclasses, not #{new_vectors.class}"
  end

  reindexed = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
  new_vectors.each do |name|
    reindexed[name] =
      if @vectors.include?(name)
        self[name]
      else
        Array.new(nrows)
      end
  end
  reindexed
end
141
+
142
+ # Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
143
+ #
144
+ # @param new_index [DaruLite::Index] The new index object on which the vectors are to
145
+ # be indexed. Must be of the same size as ncols.
146
+ # @example Reassigning vectors of a DataFrame
147
+ # df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
148
+ # df.vectors.to_a #=> [:a, :b, :c]
149
+ #
150
+ # df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
151
+ # df.vectors.to_a #=> [:foo, :bar, :baz]
152
def vectors=(new_index)
  raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)

  if new_index.size != ncols
    # The space after "to" matters: the two literals are concatenated as-is,
    # and without it the message read "not equal todataframe size".
    raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                         "dataframe size #{ncols}"
  end

  @vectors = new_index
  # Rename each stored vector after the label at the same position.
  @data.zip(new_index.to_a).each do |vect, name|
    vect.name = name
  end
  self
end
166
+ end
167
+ end
168
+ end