daru_lite 0.1.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +35 -33
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +138 -2353
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3289
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +86 -2
@@ -0,0 +1,301 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Fetchable
|
4
|
+
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
5
|
+
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
6
|
+
# rows. Use df.row[:a] for accessing row with index ':a'.
|
7
|
+
def [](*names)
  # extract_axis is handed the raw argument list (with :vector as the
  # fallback) before the names are splatted, so whatever it does to
  # `names` is what the dispatch below receives.
  dispatch_to_axis(extract_axis(names, :vector), :access, *names)
end
|
11
|
+
|
12
|
+
# Retrieve rows by positions
|
13
|
+
# @param [Array<Integer>] positions of rows to retrieve
|
14
|
+
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
15
|
+
# @example
|
16
|
+
# df = DaruLite::DataFrame.new({
|
17
|
+
# a: [1, 2, 3],
|
18
|
+
# b: ['a', 'b', 'c']
|
19
|
+
# })
|
20
|
+
# df.row_at 1, 2
|
21
|
+
# # => #<DaruLite::DataFrame(2x2)>
|
22
|
+
# # a b
|
23
|
+
# # 1 2 b
|
24
|
+
# # 2 3 c
|
25
|
+
def row_at(*positions)
  coerced = coerce_positions(*positions, nrows)
  validate_positions(*coerced, nrows)

  # Anything other than a single Integer position produces a DataFrame,
  # built from the positions exactly as the caller supplied them.
  unless coerced.is_a?(Integer)
    return DaruLite::DataFrame.new(
      get_rows_for(positions),
      index: @index.at(*positions),
      order: @vectors,
      name: @name
    )
  end

  # Single position: the row becomes a Vector indexed by the vector names.
  DaruLite::Vector.new(get_rows_for([coerced]), index: @vectors, name: @index.at(coerced))
end
|
43
|
+
|
44
|
+
# Retrieve vectors by positions
|
45
|
+
# @param [Array<Integer>] positions of vectors to retrieve
|
46
|
+
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
47
|
+
# @example
|
48
|
+
# df = DaruLite::DataFrame.new({
|
49
|
+
# a: [1, 2, 3],
|
50
|
+
# b: ['a', 'b', 'c']
|
51
|
+
# })
|
52
|
+
# df.at 0
|
53
|
+
# # => #<DaruLite::Vector(3)>
|
54
|
+
# # a
|
55
|
+
# # 0 1
|
56
|
+
# # 1 2
|
57
|
+
# # 2 3
|
58
|
+
def at(*positions)
  # A trailing axis marker is removed from the argument list; :row hands
  # the remaining positions over to row_at.
  requested_axis = positions.pop if AXES.include?(positions.last)
  return row_at(*positions) if requested_axis == :row

  coerced = coerce_positions(*positions, ncols)
  validate_positions(*coerced, ncols)
  return @data[coerced].dup if coerced.is_a?(Integer)

  columns = coerced.map { |pos| @data[pos].dup }
  DaruLite::DataFrame.new(
    columns,
    index: @index,
    order: @vectors.at(*positions),
    name: @name
  )
end
|
77
|
+
|
78
|
+
# The first ten elements of the DataFrame
|
79
|
+
#
|
80
|
+
# @param [Integer] quantity (10) The number of elements to display from the top.
|
81
|
+
def head(quantity = 10)
  last_position = quantity - 1
  row.at(0..last_position)
end
|
84
|
+
alias first head
|
85
|
+
|
86
|
+
# The last ten elements of the DataFrame
|
87
|
+
#
|
88
|
+
# @param [Integer] quantity (10) The number of elements to display from the bottom.
|
89
|
+
def tail(quantity = 10)
  # Clamp the negative start so it never reaches further back than -size.
  row.at(([-quantity, -size].max)..-1)
end
|
93
|
+
alias last tail
|
94
|
+
|
95
|
+
# Extract a dataframe given row indexes or positions
|
96
|
+
# @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
|
97
|
+
# @return [DaruLite::DataFrame]
|
98
|
+
def get_sub_dataframe(keys, by_position: true)
  return DaruLite::DataFrame.new({}) if keys == []

  # Index labels are translated to positions first when requested.
  positions = by_position ? keys : @index.pos(*keys)
  rows = row_at(*positions)

  # row_at may hand back a Vector; in that case convert it to a DataFrame
  # and transpose it so the caller always receives a DataFrame.
  rows.is_a?(DaruLite::Vector) ? rows.to_df.transpose : rows
end
|
108
|
+
|
109
|
+
# Returns the named vector's values as an Array, or an Array of `size`
# nils when the frame has no vector called +v+.
def get_vector_anyways(v)
  return Array.new(size) unless @vectors.include?(v)

  self[v].to_a
end
|
112
|
+
|
113
|
+
# @param indexes [Array] index(s) at which row tuples are retrieved
|
114
|
+
# @return [Array] returns array of row tuples at given index(s)
|
115
|
+
# @example Using DaruLite::Index
|
116
|
+
# df = DaruLite::DataFrame.new({
|
117
|
+
# a: [1, 2, 3],
|
118
|
+
# b: ['a', 'a', 'b']
|
119
|
+
# })
|
120
|
+
#
|
121
|
+
# df.access_row_tuples_by_indexs(1,2)
|
122
|
+
# # => [[2, "a"], [3, "b"]]
|
123
|
+
#
|
124
|
+
# df.index = DaruLite::Index.new([:one,:two,:three])
|
125
|
+
# df.access_row_tuples_by_indexs(:one,:three)
|
126
|
+
# # => [[1, "a"], [3, "b"]]
|
127
|
+
#
|
128
|
+
# @example Using DaruLite::MultiIndex
|
129
|
+
# mi_idx = DaruLite::MultiIndex.from_tuples [
|
130
|
+
# [:a,:one,:bar],
|
131
|
+
# [:a,:one,:baz],
|
132
|
+
# [:b,:two,:bar],
|
133
|
+
# [:a,:two,:baz],
|
134
|
+
# ]
|
135
|
+
# df_mi = DaruLite::DataFrame.new({
|
136
|
+
# a: 1..4,
|
137
|
+
# b: 'a'..'d'
|
138
|
+
# }, index: mi_idx )
|
139
|
+
#
|
140
|
+
# df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
|
141
|
+
# # => [[3, "c"]]
|
142
|
+
# df_mi.access_row_tuples_by_indexs(:a)
|
143
|
+
# # => [[1, "a"], [2, "b"], [4, "d"]]
|
144
|
+
def access_row_tuples_by_indexs(*indexes)
  # A MultiIndex is resolved through get_sub_dataframe and each row is
  # converted to an Array.
  if @index.is_a?(DaruLite::MultiIndex)
    return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a)
  end

  positions = @index.pos(*indexes)
  unless positions.is_a?(Numeric)
    columns = get_rows_for(indexes, by_position: false)
    return indexes.map { |label| columns.map { |column| column[label] } }
  end

  # Single position: make sure the result is always an array of tuples.
  tuple = get_rows_for([positions])
  tuple.first.is_a?(Array) ? tuple : [tuple]
end
|
157
|
+
|
158
|
+
# Split the dataframe into many dataframes based on category vector
|
159
|
+
# @param [object] cat_name name of category vector to split the dataframe
|
160
|
+
# @return [Array] array of dataframes split by category with category vector
|
161
|
+
# used to split not included
|
162
|
+
# @example
|
163
|
+
# df = DaruLite::DataFrame.new({
|
164
|
+
# a: [1, 2, 3],
|
165
|
+
# b: ['a', 'a', 'b']
|
166
|
+
# })
|
167
|
+
# df.to_category :b
|
168
|
+
# df.split_by_category :b
|
169
|
+
# # => [#<DaruLite::DataFrame: a (2x1)>
|
170
|
+
# # a
|
171
|
+
# # 0 1
|
172
|
+
# # 1 2,
|
173
|
+
# # #<DaruLite::DataFrame: b (1x1)>
|
174
|
+
# # a
|
175
|
+
# # 2 3]
|
176
|
+
def split_by_category(cat_name)
  category_vector = self[cat_name]
  unless category_vector.category?
    raise ArgumentError, "#{cat_name} is not a category vector"
  end

  # One frame per category: select the matching rows, rename the result
  # after the category, then remove the category vector itself.
  category_vector.categories.map do |category|
    subset = where(category_vector.eq(category))
    subset.rename(category).delete_vector(cat_name)
  end
end
|
187
|
+
|
188
|
+
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
189
|
+
# along with numbers.
|
190
|
+
def numeric_vectors
  # FIXME: Why _with_index ?..
  numeric_pairs = each_vector_with_index.select { |vector, _name| vector.numeric? }
  # Each selected entry ends with the vector's index; keep only that.
  numeric_pairs.map { |pair| pair.last }
end
|
196
|
+
|
197
|
+
# Selects from @vectors the names whose vector reports numeric? as true.
def numeric_vector_names
  @vectors.select do |name|
    vector = self[name]
    vector.numeric?
  end
end
|
200
|
+
|
201
|
+
# Return a DataFrame of only the numerical Vectors. If clone: false
|
202
|
+
# is specified as option, only a *view* of the Vectors will be
|
203
|
+
# returned. Defaults to clone: true.
|
204
|
+
def only_numerics(opts = {})
  # Anything except an explicit `clone: false` means the vectors are cloned.
  cln = opts[:clone] != false

  # Scan for numeric vectors once and reuse the result for both the data
  # and the vector order (previously the scan ran twice).
  names = numeric_vectors
  arry = names.map { |v| self[v] }

  DaruLite::DataFrame.new(arry, clone: cln, order: Index.new(names), index: @index)
end
|
211
|
+
|
212
|
+
private
|
213
|
+
|
214
|
+
# Routes a vector lookup: a Range as first argument duplicates the frame
# restricted to that subset of vectors; otherwise the lookup goes to the
# multi-index or single-index helper depending on the type of @vectors.
def access_vector(*names)
  first_name = names.first
  return dup(@vectors.subset(first_name)) if first_name.is_a?(Range)
  return access_vector_multi_index(*names) if @vectors.is_a?(MultiIndex)

  access_vector_single_index(*names)
end
|
223
|
+
|
224
|
+
# Vector lookup used when @vectors is a MultiIndex. An Integer result from
# the index returns the stored vector itself; otherwise a DataFrame is
# built from every matching tuple.
def access_vector_multi_index(*names)
  matches = @vectors[names]
  return @data[matches] if matches.is_a?(Integer)

  selected = matches.map { |tuple| @data[@vectors[tuple]] }

  # With fewer names than index levels, the levels already fixed by
  # `names` are dropped from the resulting order.
  order = names.size < @vectors.width ? matches.drop_left_level(names.size) : matches

  DaruLite::DataFrame.new(selected, index: @index, order: order)
end
|
235
|
+
|
236
|
+
# Vector lookup used when @vectors is not a MultiIndex. Returns the stored
# vector when the name resolves to a single numeric position, otherwise a
# DataFrame made of the requested vectors.
def access_vector_single_index(*names)
  if names.count < 2
    requested = names.first
    begin
      # A DateTimeIndex is queried with [] rather than pos.
      pos = if @vectors.is_a?(DaruLite::DateTimeIndex)
              @vectors[requested]
            else
              @vectors.pos(requested)
            end
    rescue IndexError
      raise IndexError, "Specified vector #{requested} does not exist"
    end
    return @data[pos] if pos.is_a?(Numeric)

    # A non-numeric lookup result replaces the list of names to fetch.
    names = pos
  end

  pairs = names.map { |name| [name, @data[@vectors.pos(name)]] }
  new_vectors = pairs.to_h

  order = names.is_a?(Array) ? DaruLite::Index.new(names) : names
  DaruLite::DataFrame.new(new_vectors, order: order, index: @index, name: @name)
end
|
253
|
+
|
254
|
+
# Row lookup by index label(s): a single numeric position yields a Vector
# named after the first label, anything else yields a DataFrame.
def access_row(*indexes)
  positions = @index.pos(*indexes)

  unless positions.is_a?(Numeric)
    rows = get_rows_for(indexes, by_position: false)
    return DaruLite::DataFrame.new(rows, index: @index.subset(*indexes), order: @vectors)
  end

  values = get_rows_for([positions])
  DaruLite::Vector.new(values, index: @vectors, name: indexes.first)
end
|
265
|
+
|
266
|
+
# @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
|
267
|
+
# because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
|
268
|
+
# values (representing a row) or an array of Vectors (that can be seen as rows)
|
269
|
+
def get_rows_for(keys, by_position: true)
  # Raise with an explicit message: a bare `raise` carries no explanation
  # and, if an exception is currently being handled, re-raises that one
  # instead of signalling this misuse. Still a RuntimeError.
  raise "keys must be an Array, got #{keys.class}" unless keys.is_a?(Array)

  if by_position
    @data.map { |vector| vector.at(*keys) }
  else
    # TODO: for now (2018-07-27), it is different than using
    #    get_rows_for(@index.pos(*keys))
    #    because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
    @data.map { |vector| vector[*keys] }
  end
end
|
283
|
+
|
284
|
+
# coerce ranges, integers and array in appropriate ways
|
285
|
+
def coerce_positions(*positions, size)
  # Several positions (or none) pass through untouched.
  return positions unless positions.size == 1

  position = positions.first
  case position
  when Integer then position
  # A Range is resolved against the list 0...size.
  when Range then size.times.to_a[position]
  else raise ArgumentError, 'Unknown position type.'
  end
end
|
299
|
+
end
|
300
|
+
end
|
301
|
+
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Filterable
|
4
|
+
# Return unique rows by vector specified or all vectors
|
5
|
+
#
|
6
|
+
# @param vtrs [String][Symbol] vector name(s) that should be considered
|
7
|
+
#
|
8
|
+
# @example
|
9
|
+
#
|
10
|
+
# => #<DaruLite::DataFrame(6x2)>
|
11
|
+
# a b
|
12
|
+
# 0 1 a
|
13
|
+
# 1 2 b
|
14
|
+
# 2 3 c
|
15
|
+
# 3 4 d
|
16
|
+
# 2 3 c
|
17
|
+
# 3 4 f
|
18
|
+
#
|
19
|
+
# 2.3.3 :> df.uniq
|
20
|
+
# => #<DaruLite::DataFrame(5x2)>
|
21
|
+
# a b
|
22
|
+
# 0 1 a
|
23
|
+
# 1 2 b
|
24
|
+
# 2 3 c
|
25
|
+
# 3 4 d
|
26
|
+
# 3 4 f
|
27
|
+
#
|
28
|
+
# 2.3.3 :> df.uniq(:a)
|
29
|
+
# => #<DaruLite::DataFrame(4x2)>
|
30
|
+
# a b
|
31
|
+
# 0 1 a
|
32
|
+
# 1 2 b
|
33
|
+
# 2 3 c
|
34
|
+
# 3 4 d
|
35
|
+
#
|
36
|
+
def uniq(*vtrs)
  # With no vectors given, every vector takes part in the grouping.
  keys = vtrs.empty? ? vectors.to_a : vtrs
  group_members = group_by(keys).groups.values

  # Keep the first member of each group, in ascending order.
  representatives = group_members.map { |members| members[0] }.sort
  row[*representatives]
end
|
42
|
+
|
43
|
+
# Retain vectors or rows if the block returns a truthy value.
|
44
|
+
#
|
45
|
+
# == Description
|
46
|
+
#
|
47
|
+
# For filtering out certain rows/vectors based on their values,
|
48
|
+
# use the #filter method. By default it iterates over vectors and
|
49
|
+
# keeps those vectors for which the block returns true. It accepts
|
50
|
+
# an optional axis argument which lets you specify whether you want
|
51
|
+
# to iterate over vectors or rows.
|
52
|
+
#
|
53
|
+
# == Arguments
|
54
|
+
#
|
55
|
+
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
|
56
|
+
# Default to :vector.
|
57
|
+
#
|
58
|
+
# == Usage
|
59
|
+
#
|
60
|
+
# # Filter vectors
|
61
|
+
#
|
62
|
+
# df.filter do |vector|
|
63
|
+
# vector.type == :numeric and vector.median < 50
|
64
|
+
# end
|
65
|
+
#
|
66
|
+
# # Filter rows
|
67
|
+
#
|
68
|
+
# df.filter(:row) do |row|
|
69
|
+
# row[:a] + row[:d] < 100
|
70
|
+
# end
|
71
|
+
def filter(axis = :vector, &block)
  # The axis-specific work is delegated to dispatch_to_axis_pl together
  # with the caller's block.
  dispatch_to_axis_pl(axis, :filter, &block)
end
|
74
|
+
|
75
|
+
# Returns a dataframe in which rows with any of the mentioned values
|
76
|
+
# are ignored.
|
77
|
+
# @param [Array] values to reject to form the new dataframe
|
78
|
+
# @return [DaruLite::DataFrame] Data Frame with only rows which don't
|
79
|
+
# contain the mentioned values
|
80
|
+
# @example
|
81
|
+
# df = DaruLite::DataFrame.new({
|
82
|
+
# a: [1, 2, 3, nil, Float::NAN, nil, 1, 7],
|
83
|
+
# b: [:a, :b, nil, Float::NAN, nil, 3, 5, 8],
|
84
|
+
# c: ['a', Float::NAN, 3, 4, 3, 5, nil, 7]
|
85
|
+
# }, index: 11..18)
|
86
|
+
# df.reject_values nil, Float::NAN
|
87
|
+
# # => #<DaruLite::DataFrame(2x3)>
|
88
|
+
# # a b c
|
89
|
+
# # 11 1 a a
|
90
|
+
# # 18 7 8 7
|
91
|
+
def reject_values(*values)
  all_positions = size.times.to_a
  rejected = @data.flat_map { |vector| vector.positions(*values) }
  kept = all_positions - rejected

  return row_at(*kept) unless kept.size == 1

  # A lone position is passed as a one-element range so that row_at still
  # returns a DataFrame rather than a Vector.
  only = kept.first
  row_at(only..only)
end
|
102
|
+
|
103
|
+
# Removes, in place, every row for which the block returns a falsy value.
def keep_row_if
  doomed = @index.size.times.reject { |position| yield(row_at(position)) }

  # Delete from the highest position downwards.
  doomed.reverse_each { |position| delete_at_position(position) }
end
|
108
|
+
|
109
|
+
# Removes, in place, every vector for which the block (given the vector
# and its name) returns a falsy value.
def keep_vector_if
  @vectors.each do |name|
    keep = yield(@data[@vectors[name]], name)
    delete_vector(name) unless keep
  end
end
|
114
|
+
|
115
|
+
# Creates a new vector from the values of the given field in the rows for which the block returns true
|
116
|
+
def filter_vector(vec, &block)
  matching_rows = each_row.select(&block)
  values = matching_rows.map { |selected| selected[vec] }
  DaruLite::Vector.new(values)
end
|
119
|
+
|
120
|
+
# Iterates over each row and retains it in a new DataFrame if the block returns
|
121
|
+
# true for that row.
|
122
|
+
def filter_rows
  return to_enum(:filter_rows) unless block_given?

  # One block result per index entry, handed to where as the row mask.
  mask = @index.map { |label| yield access_row(label) }
  where(mask)
end
|
129
|
+
|
130
|
+
# Iterates over each vector and retains it in a new DataFrame if the block returns
|
131
|
+
# true for that vector.
|
132
|
+
def filter_vectors(&block)
  return to_enum(:filter_vectors) unless block

  # Filter a duplicate so the receiver itself is left untouched.
  copy = dup
  copy.keep_vector_if(&block)
  copy
end
|
137
|
+
|
138
|
+
# Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
|
139
|
+
def where(bool_array)
  DaruLite::Core::Query.df_where(self, bool_array)
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module IOAble
|
4
|
+
module ClassMethods
|
5
|
+
# Load data from a CSV file. Specify an optional block to grab the CSV
|
6
|
+
# object and pre-condition it (for example use the `convert` or
|
7
|
+
# `header_convert` methods).
|
8
|
+
#
|
9
|
+
# == Arguments
|
10
|
+
#
|
11
|
+
# * path - Local path / Remote URL of the file to load specified as a String.
|
12
|
+
#
|
13
|
+
# == Options
|
14
|
+
#
|
15
|
+
# Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
|
16
|
+
# and uses those to eventually construct the resulting DataFrame.
|
17
|
+
#
|
18
|
+
# == Verbose Description
|
19
|
+
#
|
20
|
+
# You can specify all the options to the `.from_csv` function that you
|
21
|
+
# do to the Ruby `CSV.read()` function, since this is what is used internally.
|
22
|
+
#
|
23
|
+
# For example, if the columns in your CSV file are separated by something
|
24
|
+
# other than commas, you can use the `:col_sep` option. If you want to
|
25
|
+
# convert numeric values to numbers and not keep them as strings, you can
|
26
|
+
# use the `:converters` option and set it to `:numeric`.
|
27
|
+
#
|
28
|
+
# The `.from_csv` function uses the following defaults for reading CSV files
|
29
|
+
# (that are passed into the `CSV.read()` function):
|
30
|
+
#
|
31
|
+
# {
|
32
|
+
# :col_sep => ',',
|
33
|
+
# :converters => :numeric
|
34
|
+
# }
|
35
|
+
def from_csv(path, opts = {}, &block)
  # Path, options and the optional block go to DaruLite::IO unchanged.
  DaruLite::IO.from_csv(path, opts, &block)
end
|
38
|
+
|
39
|
+
# Read data from an Excel file into a DataFrame.
|
40
|
+
#
|
41
|
+
# == Arguments
|
42
|
+
#
|
43
|
+
# * path - Path of the file to be read.
|
44
|
+
#
|
45
|
+
# == Options
|
46
|
+
#
|
47
|
+
# *:worksheet_id - ID of the worksheet that is to be read.
|
48
|
+
def from_excel(path, opts = {}, &block)
  # Path, options and the optional block go to DaruLite::IO unchanged.
  DaruLite::IO.from_excel(path, opts, &block)
end
|
51
|
+
|
52
|
+
# Execute a database query and return the result as a DataFrame
|
53
|
+
#
|
54
|
+
# @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQLite3 database.
|
55
|
+
# @param query [String] The query to be executed
|
56
|
+
#
|
57
|
+
# @return A dataframe containing the data resulting from the query
|
58
|
+
#
|
59
|
+
# USE:
|
60
|
+
#
|
61
|
+
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
62
|
+
# DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
|
63
|
+
#
|
64
|
+
# #Alternatively
|
65
|
+
#
|
66
|
+
# require 'dbi'
|
67
|
+
# DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
|
68
|
+
def from_sql(dbh, query)
  DaruLite::IO.from_sql(dbh, query)
end
|
71
|
+
|
72
|
+
# Read a dataframe from AR::Relation
|
73
|
+
#
|
74
|
+
# @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
|
75
|
+
# @param fields [Array] Field names to be loaded (optional)
|
76
|
+
#
|
77
|
+
# @return A dataframe containing the data loaded from the relation
|
78
|
+
#
|
79
|
+
# USE:
|
80
|
+
#
|
81
|
+
# # When Post model is defined as:
|
82
|
+
# class Post < ActiveRecord::Base
|
83
|
+
# scope :active, -> { where.not(published_at: nil) }
|
84
|
+
# end
|
85
|
+
#
|
86
|
+
# # You can load active posts into a dataframe by:
|
87
|
+
# DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
|
88
|
+
def from_activerecord(relation, *fields)
  DaruLite::IO.from_activerecord(relation, *fields)
end
|
91
|
+
|
92
|
+
# Read the database from a plaintext file. For this method to work,
|
93
|
+
# the data should be present in a plain text file in columns. See
|
94
|
+
# spec/fixtures/bank2.dat for an example.
|
95
|
+
#
|
96
|
+
# == Arguments
|
97
|
+
#
|
98
|
+
# * path - Path of the file to be read.
|
99
|
+
# * fields - Vector names of the resulting database.
|
100
|
+
#
|
101
|
+
# == Usage
|
102
|
+
#
|
103
|
+
# df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
|
104
|
+
def from_plaintext(path, fields)
  DaruLite::IO.from_plaintext(path, fields)
end
|
107
|
+
|
108
|
+
# Rebuilds a DataFrame from a marshalled Hash holding :data, :index,
# :order and :name.
# NOTE(review): Marshal.load can instantiate arbitrary objects; this
# presumably only ever sees data produced by #_dump — confirm that
# untrusted input cannot reach it.
def _load(data)
  fields = Marshal.load(data)
  DaruLite::DataFrame.new(
    fields[:data],
    index: fields[:index],
    order: fields[:order],
    name: fields[:name]
  )
end
|
117
|
+
end
|
118
|
+
|
119
|
+
# Extends the including class/module with ClassMethods.
def self.included(base)
  base.extend(ClassMethods)
end
|
122
|
+
|
123
|
+
# Write this DataFrame to a CSV file.
|
124
|
+
#
|
125
|
+
# == Arguments
|
126
|
+
#
|
127
|
+
# * filename - Path of CSV file where the DataFrame is to be saved.
|
128
|
+
#
|
129
|
+
# == Options
|
130
|
+
#
|
131
|
+
# * convert_comma - If set to *true*, will convert any commas in any
|
132
|
+
# of the data to full stops ('.').
|
133
|
+
# All the options accepted by CSV.read() can also be passed into this
|
134
|
+
# function.
|
135
|
+
def write_csv(filename, opts = {})
  DaruLite::IO.dataframe_write_csv(self, filename, opts)
end
|
138
|
+
|
139
|
+
# Write this dataframe to an Excel Spreadsheet
|
140
|
+
#
|
141
|
+
# == Arguments
|
142
|
+
#
|
143
|
+
# * filename - The path of the file where the DataFrame should be written.
|
144
|
+
def write_excel(filename, opts = {})
  DaruLite::IO.dataframe_write_excel(self, filename, opts)
end
|
147
|
+
|
148
|
+
# Insert each case of the Dataset on the selected table
|
149
|
+
#
|
150
|
+
# == Arguments
|
151
|
+
#
|
152
|
+
# * dbh - DBI database connection object.
|
153
|
+
# * table - Name of the table the rows are inserted into.
|
154
|
+
#
|
155
|
+
# == Usage
|
156
|
+
#
|
157
|
+
# ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
|
158
|
+
# dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
|
159
|
+
# ds.write_sql(dbh,"test")
|
160
|
+
def write_sql(dbh, table)
  DaruLite::IO.dataframe_write_sql(self, dbh, table)
end
|
163
|
+
|
164
|
+
# Use marshalling to save dataframe to a file.
|
165
|
+
def save(filename)
  DaruLite::IO.save(self, filename)
end
|
168
|
+
|
169
|
+
# Serialises the frame as a marshalled Hash of its data, index (as an
# Array), vector order (as an Array) and name. The depth argument is unused.
def _dump(_depth)
  payload = {
    data: @data,
    index: @index.to_a,
    order: @vectors.to_a,
    name: @name
  }
  Marshal.dump(payload)
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|