daru_lite 0.1 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (73) hide show
  1. checksums.yaml +4 -4
  2. data/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
  3. data/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
  4. data/.github/workflows/ci.yml +20 -0
  5. data/.rubocop_todo.yml +35 -33
  6. data/README.md +19 -115
  7. data/daru_lite.gemspec +1 -0
  8. data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
  9. data/lib/daru_lite/data_frame/calculatable.rb +140 -0
  10. data/lib/daru_lite/data_frame/convertible.rb +107 -0
  11. data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
  12. data/lib/daru_lite/data_frame/fetchable.rb +301 -0
  13. data/lib/daru_lite/data_frame/filterable.rb +144 -0
  14. data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
  15. data/lib/daru_lite/data_frame/indexable.rb +168 -0
  16. data/lib/daru_lite/data_frame/iterable.rb +339 -0
  17. data/lib/daru_lite/data_frame/joinable.rb +152 -0
  18. data/lib/daru_lite/data_frame/missable.rb +75 -0
  19. data/lib/daru_lite/data_frame/pivotable.rb +108 -0
  20. data/lib/daru_lite/data_frame/queryable.rb +67 -0
  21. data/lib/daru_lite/data_frame/setable.rb +109 -0
  22. data/lib/daru_lite/data_frame/sortable.rb +241 -0
  23. data/lib/daru_lite/dataframe.rb +142 -2355
  24. data/lib/daru_lite/index/index.rb +13 -0
  25. data/lib/daru_lite/maths/statistics/vector.rb +1 -1
  26. data/lib/daru_lite/vector/aggregatable.rb +9 -0
  27. data/lib/daru_lite/vector/calculatable.rb +78 -0
  28. data/lib/daru_lite/vector/convertible.rb +77 -0
  29. data/lib/daru_lite/vector/duplicatable.rb +17 -0
  30. data/lib/daru_lite/vector/fetchable.rb +175 -0
  31. data/lib/daru_lite/vector/filterable.rb +128 -0
  32. data/lib/daru_lite/vector/indexable.rb +77 -0
  33. data/lib/daru_lite/vector/iterable.rb +95 -0
  34. data/lib/daru_lite/vector/joinable.rb +17 -0
  35. data/lib/daru_lite/vector/missable.rb +124 -0
  36. data/lib/daru_lite/vector/queryable.rb +45 -0
  37. data/lib/daru_lite/vector/setable.rb +47 -0
  38. data/lib/daru_lite/vector/sortable.rb +113 -0
  39. data/lib/daru_lite/vector.rb +36 -932
  40. data/lib/daru_lite/version.rb +1 -1
  41. data/spec/data_frame/aggregatable_example.rb +65 -0
  42. data/spec/data_frame/buildable_example.rb +109 -0
  43. data/spec/data_frame/calculatable_example.rb +135 -0
  44. data/spec/data_frame/convertible_example.rb +180 -0
  45. data/spec/data_frame/duplicatable_example.rb +111 -0
  46. data/spec/data_frame/fetchable_example.rb +476 -0
  47. data/spec/data_frame/filterable_example.rb +250 -0
  48. data/spec/data_frame/indexable_example.rb +221 -0
  49. data/spec/data_frame/iterable_example.rb +465 -0
  50. data/spec/data_frame/joinable_example.rb +106 -0
  51. data/spec/data_frame/missable_example.rb +47 -0
  52. data/spec/data_frame/pivotable_example.rb +297 -0
  53. data/spec/data_frame/queryable_example.rb +92 -0
  54. data/spec/data_frame/setable_example.rb +482 -0
  55. data/spec/data_frame/sortable_example.rb +350 -0
  56. data/spec/dataframe_spec.rb +181 -3243
  57. data/spec/index/index_spec.rb +8 -0
  58. data/spec/vector/aggregatable_example.rb +27 -0
  59. data/spec/vector/calculatable_example.rb +82 -0
  60. data/spec/vector/convertible_example.rb +126 -0
  61. data/spec/vector/duplicatable_example.rb +48 -0
  62. data/spec/vector/fetchable_example.rb +463 -0
  63. data/spec/vector/filterable_example.rb +165 -0
  64. data/spec/vector/indexable_example.rb +201 -0
  65. data/spec/vector/iterable_example.rb +111 -0
  66. data/spec/vector/joinable_example.rb +25 -0
  67. data/spec/vector/missable_example.rb +88 -0
  68. data/spec/vector/queryable_example.rb +91 -0
  69. data/spec/vector/setable_example.rb +300 -0
  70. data/spec/vector/sortable_example.rb +242 -0
  71. data/spec/vector_spec.rb +111 -1805
  72. metadata +102 -3
  73. data/.github/ISSUE_TEMPLATE.md +0 -18
@@ -0,0 +1,144 @@
1
module DaruLite
  class DataFrame
    # Filtering helpers for DataFrame: de-duplicating rows, keeping or
    # rejecting rows/vectors by predicate, and boolean-array queries.
    module Filterable
      # Return unique rows by vector specified or all vectors.
      #
      # Rows are grouped on the given vectors and the first row of every
      # group is kept, in ascending order of the retained row indexes.
      #
      # @param vtrs [String, Symbol] vector name(s) that should be considered;
      #   when none are given, all vectors are used
      #
      # @example
      #
      #   => #<DaruLite::DataFrame(6x2)>
      #        a   b
      #    0   1   a
      #    1   2   b
      #    2   3   c
      #    3   4   d
      #    2   3   c
      #    3   4   f
      #
      #   2.3.3 :> df.uniq
      #   => #<DaruLite::DataFrame(5x2)>
      #        a   b
      #    0   1   a
      #    1   2   b
      #    2   3   c
      #    3   4   d
      #    3   4   f
      #
      #   2.3.3 :> df.uniq(:a)
      #   => #<DaruLite::DataFrame(4x2)>
      #        a   b
      #    0   1   a
      #    1   2   b
      #    2   3   c
      #    3   4   d
      #
      def uniq(*vtrs)
        # A splat is always an Array, so it can be used directly.
        columns = vtrs.empty? ? vectors.to_a : vtrs
        first_of_each_group = group_by(columns).groups.values.map { |members| members[0] }
        row[*first_of_each_group.sort]
      end

      # Retain vectors or rows if the block returns a truthy value.
      #
      # == Description
      #
      # For filtering out certain rows/vectors based on their values,
      # use the #filter method. By default it iterates over vectors and
      # keeps those vectors for which the block returns true. It accepts
      # an optional axis argument which lets you specify whether you want
      # to iterate over vectors or rows.
      #
      # == Arguments
      #
      # * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
      #   Defaults to :vector.
      #
      # == Usage
      #
      #   # Filter vectors
      #
      #   df.filter do |vector|
      #     vector.type == :numeric and vector.median < 50
      #   end
      #
      #   # Filter rows
      #
      #   df.filter(:row) do |row|
      #     row[:a] + row[:d] < 100
      #   end
      def filter(axis = :vector, &block)
        dispatch_to_axis_pl(axis, :filter, &block)
      end

      # Returns a dataframe in which rows with any of the mentioned values
      # are ignored.
      # @param values [Array] values to reject to form the new dataframe
      # @return [DaruLite::DataFrame] Data Frame with only the rows which don't
      #   contain the mentioned values
      # @example
      #   df = DaruLite::DataFrame.new({
      #     a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
      #     b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   8],
      #     c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
      #   }, index: 11..18)
      #   df.reject_values nil, Float::NAN
      #   # => #<DaruLite::DataFrame(2x3)>
      #   #       a   b   c
      #   #   11   1   a   a
      #   #   18   7   8   7
      def reject_values(*values)
        rejected = @data.flat_map { |vec| vec.positions(*values) }
        positions = size.times.to_a - rejected

        # With exactly one surviving position #row_at wouldn't return a
        # dataframe, so the position is passed as a one-element range instead.
        if positions.size == 1
          only = positions.first
          return row_at(only..only)
        end

        row_at(*positions)
      end

      # Destructively drop every row for which the block returns a falsy value.
      #
      # The block receives `row_at(position)` for each position of the index.
      # Returns an Enumerator when no block is given, like #filter_rows does.
      def keep_row_if
        return to_enum(:keep_row_if) unless block_given?

        doomed = @index.size.times.reject { |position| yield(row_at(position)) }
        # Walk backwards so the remaining positions to delete are not shifted
        # by earlier deletions.
        doomed.reverse_each { |position| delete_at_position(position) }
      end

      # Destructively drop every vector for which the block returns a falsy
      # value.
      #
      # The block receives the stored vector and its name. Returns an
      # Enumerator when no block is given, like #filter_vectors does.
      def keep_vector_if
        return to_enum(:keep_vector_if) unless block_given?

        @vectors.each do |vector|
          delete_vector(vector) unless yield(@data[@vectors[vector]], vector)
        end
      end

      # Creates a new vector with the data of a given field, taken from the
      # rows for which the block returns true.
      def filter_vector(vec, &block)
        selected = each_row.select(&block)
        DaruLite::Vector.new(selected.map { |r| r[vec] })
      end

      # Iterates over each row and retains it in a new DataFrame if the block returns
      # true for that row. Returns an Enumerator when no block is given.
      def filter_rows
        return to_enum(:filter_rows) unless block_given?

        keep_rows = @index.map { |index| yield access_row(index) }

        where(keep_rows)
      end

      # Iterates over each vector and retains it in a new DataFrame if the block returns
      # true for that vector. Returns an Enumerator when no block is given.
      def filter_vectors(&block)
        return to_enum(:filter_vectors) unless block

        # Work on a copy so the receiver is left untouched.
        dup.tap { |df| df.keep_vector_if(&block) }
      end

      # Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
      def where(bool_array)
        DaruLite::Core::Query.df_where(self, bool_array)
      end
    end
  end
end
@@ -0,0 +1,179 @@
1
module DaruLite
  class DataFrame
    # Import/export entry points for DataFrame. Every reader and writer here
    # delegates to DaruLite::IO; the module also implements the Marshal
    # hooks (_dump / _load).
    module IOAble
      # Class-level constructors, added to the including class by .included.
      module ClassMethods
        # Load data from a CSV file. Specify an optional block to grab the CSV
        # object and pre-condition it (for example use the `convert` or
        # `header_convert` methods).
        #
        # == Arguments
        #
        # * path - Local path / Remote URL of the file to load specified as a String.
        #
        # == Options
        #
        # Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
        # and uses those to eventually construct the resulting DataFrame.
        #
        # == Verbose Description
        #
        # You can specify all the options to the `.from_csv` function that you
        # do to the Ruby `CSV.read()` function, since this is what is used internally.
        #
        # For example, if the columns in your CSV file are separated by something
        # other than commas, you can use the `:col_sep` option. If you want to
        # convert numeric values to numbers and not keep them as strings, you can
        # use the `:converters` option and set it to `:numeric`.
        #
        # The `.from_csv` function uses the following defaults for reading CSV files
        # (that are passed into the `CSV.read()` function):
        #
        #   {
        #     :col_sep    => ',',
        #     :converters => :numeric
        #   }
        def from_csv(path, opts = {}, &block)
          DaruLite::IO.from_csv(path, opts, &block)
        end

        # Read data from an Excel file into a DataFrame.
        #
        # == Arguments
        #
        # * path - Path of the file to be read.
        #
        # == Options
        #
        # * :worksheet_id - ID of the worksheet that is to be read.
        def from_excel(path, opts = {}, &block)
          DaruLite::IO.from_excel(path, opts, &block)
        end

        # Execute a database query and return the result as a DataFrame.
        #
        # @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQLite3 database.
        # @param query [String] The query to be executed
        #
        # @return A dataframe containing the data resulting from the query
        #
        # USE:
        #
        #   dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
        #   DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
        #
        #   # Alternatively
        #
        #   require 'dbi'
        #   DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
        def from_sql(dbh, query)
          DaruLite::IO.from_sql(dbh, query)
        end

        # Read a dataframe from AR::Relation
        #
        # @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
        # @param fields [Array] Field names to be loaded (optional)
        #
        # @return A dataframe containing the data loaded from the relation
        #
        # USE:
        #
        #   # When Post model is defined as:
        #   class Post < ActiveRecord::Base
        #     scope :active, -> { where.not(published_at: nil) }
        #   end
        #
        #   # You can load active posts into a dataframe by:
        #   DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
        def from_activerecord(relation, *fields)
          DaruLite::IO.from_activerecord(relation, *fields)
        end

        # Read the database from a plaintext file. For this method to work,
        # the data should be present in a plain text file in columns. See
        # spec/fixtures/bank2.dat for an example.
        #
        # == Arguments
        #
        # * path - Path of the file to be read.
        # * fields - Vector names of the resulting database.
        #
        # == Usage
        #
        #   df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
        def from_plaintext(path, fields)
          DaruLite::IO.from_plaintext(path, fields)
        end

        # Marshal hook: rebuild a DataFrame from the String produced by #_dump.
        #
        # The string is expected to hold a marshalled Hash with the keys
        # :data, :index, :order and :name.
        #
        # NOTE(security): Marshal.load can instantiate arbitrary objects, so
        # only feed it data from a trusted source.
        def _load(data)
          payload = Marshal.load(data)
          DaruLite::DataFrame.new(
            payload[:data],
            index: payload[:index],
            order: payload[:order],
            name: payload[:name]
          )
        end
      end

      # Mixin hook: expose ClassMethods on whichever class includes IOAble.
      def self.included(base)
        base.extend(ClassMethods)
      end

      # Write this DataFrame to a CSV file.
      #
      # == Arguments
      #
      # * filename - Path of CSV file where the DataFrame is to be saved.
      #
      # == Options
      #
      # * convert_comma - If set to *true*, will convert any commas in any
      #   of the data to full stops ('.').
      #
      # All the options accepted by CSV.read() can also be passed into this
      # function.
      def write_csv(filename, opts = {})
        DaruLite::IO.dataframe_write_csv(self, filename, opts)
      end

      # Write this dataframe to an Excel Spreadsheet
      #
      # == Arguments
      #
      # * filename - The path of the file where the DataFrame should be written.
      # * opts - Options hash handed unchanged to DaruLite::IO.
      def write_excel(filename, opts = {})
        DaruLite::IO.dataframe_write_excel(self, filename, opts)
      end

      # Insert each case of the Dataset on the selected table
      #
      # == Arguments
      #
      # * dbh - DBI database connection object.
      # * table - Name of the table to insert into.
      #
      # == Usage
      #
      #   ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
      #   dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
      #   ds.write_sql(dbh,"test")
      def write_sql(dbh, table)
        DaruLite::IO.dataframe_write_sql(self, dbh, table)
      end

      # Use marshalling to save dataframe to a file.
      def save(filename)
        DaruLite::IO.save(self, filename)
      end

      # Marshal hook: serialise the frame as a marshalled Hash of its data,
      # index labels, vector order and name. The counterpart is
      # ClassMethods#_load.
      def _dump(_depth)
        payload = {
          data: @data,
          index: @index.to_a,
          order: @vectors.to_a,
          name: @name
        }
        Marshal.dump(payload)
      end
    end
  end
end
@@ -0,0 +1,168 @@
1
module DaruLite
  class DataFrame
    # Index management for DataFrame: promoting vectors to the row index,
    # reindexing rows or vectors, and reassigning either index wholesale.
    module Indexable
      # Strategy used by #set_index when one vector becomes a DaruLite::Index.
      module SetSingleIndexStrategy
        def self.uniq_size(df, col)
          df[col].uniq.size
        end

        def self.new_index(df, col)
          DaruLite::Index.new(df[col].to_a)
        end

        def self.delete_vector(df, col)
          df.delete_vector(col)
        end
      end

      # Strategy used by #set_index when `categorical: true` is requested.
      # It has no uniq_size because #set_index skips the uniqueness check
      # for categorical indexes.
      module SetCategoricalIndexStrategy
        def self.new_index(df, col)
          DaruLite::CategoricalIndex.new(df[col].to_a)
        end

        def self.delete_vector(df, col)
          df.delete_vector(col)
        end
      end

      # Strategy used by #set_index when several vectors become a
      # DaruLite::MultiIndex named after those vectors.
      module SetMultiIndexStrategy
        def self.uniq_size(df, cols)
          df[*cols].uniq.size
        end

        def self.new_index(df, cols)
          DaruLite::MultiIndex.from_arrays(df[*cols].map_vectors(&:to_a)).tap do |mi|
            mi.name = cols
          end
        end

        def self.delete_vector(df, cols)
          df.delete_vectors(*cols)
        end
      end

      # Set a particular column (or a list of columns) as the new index of
      # the DataFrame.
      #
      # @param new_index_col vector name to use as the index; anything that
      #   responds to #to_a is treated as a list of names and produces a
      #   MultiIndex (unless +categorical+ is set)
      # @param keep [Boolean] when true the source vector(s) are left in the
      #   DataFrame; otherwise they are deleted
      # @param categorical [Boolean] when true a CategoricalIndex is built and
      #   the uniqueness check is skipped
      # @raise [ArgumentError] if the index is not categorical and its values
      #   are not all unique
      # @return [DaruLite::DataFrame] self
      def set_index(new_index_col, keep: false, categorical: false)
        if categorical
          strategy = SetCategoricalIndexStrategy
        elsif new_index_col.respond_to?(:to_a)
          strategy = SetMultiIndexStrategy
          new_index_col = new_index_col.to_a
        else
          strategy = SetSingleIndexStrategy
        end

        if !categorical && @size != strategy.uniq_size(self, new_index_col)
          raise ArgumentError, 'All elements in new index must be unique.'
        end

        self.index = strategy.new_index(self, new_index_col)
        strategy.delete_vector(self, new_index_col) unless keep
        self
      end

      # Change the index of the DataFrame and preserve the labels of the previous
      # indexing. New index can be DaruLite::Index or any of its subclasses.
      # Labels that are absent from the current index produce rows of nils.
      #
      # @param new_index [DaruLite::Index] The new Index for reindexing the DataFrame.
      # @raise [ArgumentError] if +new_index+ is not a DaruLite::Index
      # @example Reindexing DataFrame
      #   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]},
      #     index: ['a','b','c','d'])
      #   #=>
      #   ##<DaruLite::DataFrame:83278130 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
      #   #                    a          b
      #   #         a          1         11
      #   #         b          2         22
      #   #         c          3         33
      #   #         d          4         44
      #   df.reindex DaruLite::Index.new(['b', 0, 'a', 'g'])
      #   #=>
      #   ##<DaruLite::DataFrame:83177070 @name = b19277b8-c548-41da-ad9a-2ad8c060e273 @size = 4>
      #   #                    a          b
      #   #         b          2         22
      #   #         0        nil        nil
      #   #         a          1         11
      #   #         g        nil        nil
      def reindex(new_index)
        unless new_index.is_a?(DaruLite::Index)
          raise ArgumentError, 'Must pass the new index of type Index or its ' \
                               "subclasses, not #{new_index.class}"
        end

        reindexed = DaruLite::DataFrame.new({}, order: @vectors, index: new_index, name: @name)
        new_index.each_with_object(reindexed) do |idx, memo|
          memo.row[idx] = @index.include?(idx) ? row[idx] : Array.new(ncols)
        end
      end

      # Turn the current index back into vector(s) of the DataFrame.
      #
      # The index is converted with `index.to_df`; that frame's index becomes
      # the new index of self, one vector per index name is copied into self,
      # and those vectors are ordered before the existing ones.
      #
      # @return [DaruLite::DataFrame] self
      def reset_index
        index_df = index.to_df
        names = index.name
        # A single (non-Array) name is wrapped so both cases share one path.
        names = [names] unless names.instance_of?(Array)
        new_vectors = names + vectors.to_a
        self.index = index_df.index
        names.each { |name| self[name] = index_df[name] }
        self.order = new_vectors
        self
      end

      # Reassign index with a new index of type DaruLite::Index or any of its subclasses.
      #
      # @param idx [DaruLite::Index] New index object on which the rows of the dataframe
      #   are to be indexed.
      # @example Reassigning index of a DataFrame
      #   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [11,22,33,44]})
      #   df.index.to_a #=> [0,1,2,3]
      #
      #   df.index = DaruLite::Index.new(['a','b','c','d'])
      #   df.index.to_a #=> ['a','b','c','d']
      #   df.row['a'].to_a #=> [1,11]
      def index=(idx)
        @index = Index.coerce(idx)
        # Every stored vector is pointed at the same new index.
        @data.each { |vec| vec.index = @index }

        self
      end

      # Build a new DataFrame whose vectors follow +new_vectors+. Names that
      # are absent from the current vectors produce vectors of nils.
      #
      # @param new_vectors [DaruLite::Index] the vector names of the result
      # @raise [ArgumentError] if +new_vectors+ is not a DaruLite::Index
      def reindex_vectors(new_vectors)
        unless new_vectors.is_a?(DaruLite::Index)
          raise ArgumentError, 'Must pass the new index of type Index or its ' \
                               "subclasses, not #{new_vectors.class}"
        end

        reindexed = DaruLite::DataFrame.new({}, order: new_vectors, index: @index, name: @name)
        new_vectors.each_with_object(reindexed) do |vec, memo|
          memo[vec] = @vectors.include?(vec) ? self[vec] : Array.new(nrows)
        end
      end

      # Reassign vectors with a new index of type DaruLite::Index or any of its subclasses.
      #
      # @param new_index [DaruLite::Index] The new index object on which the vectors are to
      #   be indexed. Must be of the same size as ncols.
      # @raise [ArgumentError] if +new_index+ is not a DaruLite::Index or its
      #   size differs from ncols
      # @example Reassigning vectors of a DataFrame
      #   df = DaruLite::DataFrame.new({a: [1,2,3,4], b: [:a,:b,:c,:d], c: [11,22,33,44]})
      #   df.vectors.to_a #=> [:a, :b, :c]
      #
      #   df.vectors = DaruLite::Index.new([:foo, :bar, :baz])
      #   df.vectors.to_a #=> [:foo, :bar, :baz]
      def vectors=(new_index)
        raise ArgumentError, 'Can only reindex with Index and its subclasses' unless new_index.is_a?(DaruLite::Index)

        if new_index.size != ncols
          # The two pieces are concatenated, so the first must end in a space.
          raise ArgumentError, "Specified index length #{new_index.size} not equal to " \
                               "dataframe size #{ncols}"
        end

        @vectors = new_index
        # Rename each stored vector after its new label, position by position.
        @data.zip(new_index.to_a).each do |vect, name|
          vect.name = name
        end
        self
      end
    end
  end
end