daru_lite 0.1.1 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/.rubocop_todo.yml +35 -33
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +138 -2353
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3289
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +86 -2
@@ -0,0 +1,301 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Fetchable
|
4
|
+
# Access a row or a vector by name.
#
# Pass the name(s) of the row/vector, optionally followed by the axis
# (:row or :vector). The axis defaults to *:vector*. Using this method to
# access rows is not recommended; prefer df.row[:a] for the row with
# index ':a'.
#
# The axis is taken from +names+ by +extract_axis+ (defined elsewhere;
# presumably it removes a trailing axis symbol from +names+ — confirm),
# and the lookup itself is delegated through +dispatch_to_axis+ with the
# :access action.
def [](*names)
  target_axis = extract_axis(names, :vector)
  dispatch_to_axis(target_axis, :access, *names)
end
|
11
|
+
|
12
|
+
# Retrieve rows by positions
# @param [Array<Integer>] positions of rows to retrieve
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
# @example
#   df = DaruLite::DataFrame.new({
#     a: [1, 2, 3],
#     b: ['a', 'b', 'c']
#   })
#   df.row_at 1, 2
#   # => #<DaruLite::DataFrame(2x2)>
#   #       a   b
#   #   1   2   b
#   #   2   3   c
def row_at(*positions)
  # Keep the caller's arguments untouched: they are what gets handed to
  # get_rows_for / @index.at in the multi-row case below.
  requested = positions
  positions = coerce_positions(*positions, nrows)
  validate_positions(*positions, nrows)

  # Anything other than a single Integer position yields a DataFrame.
  unless positions.is_a?(Integer)
    return DaruLite::DataFrame.new(
      get_rows_for(requested),
      index: @index.at(*requested),
      order: @vectors,
      name: @name
    )
  end

  # A single Integer position yields one row as a Vector indexed by the
  # vector names and named after the row's index label.
  DaruLite::Vector.new(
    get_rows_for([positions]),
    index: @vectors,
    name: @index.at(positions)
  )
end
|
43
|
+
|
44
|
+
# Retrieve vectors by positions
#
# If the last argument is one of AXES it is removed from the argument
# list; when it is :row the call is forwarded to #row_at.
#
# @param [Array<Integer>] positions of vectors to retrieve
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
# @example
#   df = DaruLite::DataFrame.new({
#     a: [1, 2, 3],
#     b: ['a', 'b', 'c']
#   })
#   df.at 0
#   # => #<DaruLite::Vector(3)>
#   #       a
#   #   0   1
#   #   1   2
#   #   2   3
def at(*positions)
  # The pop only happens when the last argument really is an axis.
  return row_at(*positions) if AXES.include?(positions.last) && positions.pop == :row

  requested = positions
  positions = coerce_positions(*positions, ncols)
  validate_positions(*positions, ncols)

  # Single position: hand back a copy of that one vector.
  return @data[positions].dup if positions.is_a?(Integer)

  # Several positions: build a new frame from copies of the chosen vectors.
  columns = positions.map { |pos| @data[pos].dup }
  DaruLite::DataFrame.new(
    columns,
    index: @index,
    order: @vectors.at(*requested),
    name: @name
  )
end
|
77
|
+
|
78
|
+
# The first +quantity+ rows of the DataFrame (ten by default).
#
# @param [Integer] quantity (10) The number of elements to display from the top.
#
# NOTE(review): for quantity = 0 the range below is 0..-1, which ordinary
# Ruby slicing treats as "everything" — confirm what row.at does with it.
def head(quantity = 10)
  last_position = quantity - 1
  row.at(0..last_position)
end
alias first head
|
85
|
+
|
86
|
+
# The last +quantity+ rows of the DataFrame (ten by default).
#
# @param [Integer] quantity (10) The number of elements to display from the bottom.
#
# NOTE(review): for quantity = 0 the range below is 0..-1, which ordinary
# Ruby slicing treats as "everything" — confirm what row.at does with it.
def tail(quantity = 10)
  # Never start further back than -size, so asking for more rows than
  # exist still yields a range that begins at the first row.
  row.at([-quantity, -size].max..-1)
end
alias last tail
|
94
|
+
|
95
|
+
# Extract a dataframe given row indexes or positions
# @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
# @param by_position [Boolean] when false, keys are first translated through @index.pos
# @return [DaruLite::DataFrame] an empty frame when keys is empty
def get_sub_dataframe(keys, by_position: true)
  return DaruLite::DataFrame.new({}) if keys == []

  positions = by_position ? keys : @index.pos(*keys)
  selection = row_at(*positions)
  return selection unless selection.is_a?(DaruLite::Vector)

  # row_at returns a Vector for a single row; turn it back into a frame.
  selection.to_df.transpose
end
|
108
|
+
|
109
|
+
# Values of vector +v+ as an Array, or an Array of +size+ nils when the
# frame has no vector of that name.
def get_vector_anyways(v)
  return Array.new(size) unless @vectors.include?(v)

  self[v].to_a
end
|
112
|
+
|
113
|
+
# @param indexes [Array] index(s) at which row tuples are retrieved
# @return [Array] returns array of row tuples at given index(s)
# @example Using DaruLite::Index
#   df = DaruLite::DataFrame.new({
#     a: [1, 2, 3],
#     b: ['a', 'a', 'b']
#   })
#
#   df.access_row_tuples_by_indexs(1,2)
#   # => [[2, "a"], [3, "b"]]
#
#   df.index = DaruLite::Index.new([:one,:two,:three])
#   df.access_row_tuples_by_indexs(:one,:three)
#   # => [[1, "a"], [3, "b"]]
#
# @example Using DaruLite::MultiIndex
#   mi_idx = DaruLite::MultiIndex.from_tuples [
#     [:a,:one,:bar],
#     [:a,:one,:baz],
#     [:b,:two,:bar],
#     [:a,:two,:baz],
#   ]
#   df_mi = DaruLite::DataFrame.new({
#     a: 1..4,
#     b: 'a'..'d'
#   }, index: mi_idx )
#
#   df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
#   # => [[3, "c"]]
#   df_mi.access_row_tuples_by_indexs(:a)
#   # => [[1, "a"], [2, "b"], [4, "d"]]
def access_row_tuples_by_indexs(*indexes)
  # MultiIndex frames go through the sub-dataframe path.
  if @index.is_a?(DaruLite::MultiIndex)
    return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a)
  end

  located = @index.pos(*indexes)

  # Several positions: fetch per-vector slices, then regroup them per index.
  unless located.is_a?(Numeric)
    columns = get_rows_for(indexes, by_position: false)
    return indexes.map { |label| columns.map { |column| column[label] } }
  end

  # One position: always answer with an array of tuples.
  tuple = get_rows_for([located])
  tuple.first.is_a?(Array) ? tuple : [tuple]
end
|
157
|
+
|
158
|
+
# Split the dataframe into many dataframes based on category vector
# @param [object] cat_name name of category vector to split the dataframe
# @return [Array] array of dataframes split by category with category vector
#   used to split not included
# @raise [ArgumentError] when the named vector is not a category vector
# @example
#   df = DaruLite::DataFrame.new({
#     a: [1, 2, 3],
#     b: ['a', 'a', 'b']
#   })
#   df.to_category :b
#   df.split_by_category :b
#   # => [#<DaruLite::DataFrame: a (2x1)>
#   #       a
#   #   0   1
#   #   1   2,
#   # #<DaruLite::DataFrame: b (1x1)>
#   #       a
#   #   2   3]
def split_by_category(cat_name)
  category_vector = self[cat_name]
  unless category_vector.category?
    raise ArgumentError, "#{cat_name} is not a category vector"
  end

  category_vector.categories.map do |category|
    # Rows of this category, named after it, without the splitting vector.
    matching = where(category_vector.eq(category))
    renamed = matching.rename(category)
    renamed.delete_vector(cat_name)
  end
end
|
187
|
+
|
188
|
+
# Return the indexes (names) of all the numeric vectors. Will include
# vectors with nils along with numbers.
def numeric_vectors
  # FIXME: Why _with_index ?..
  each_vector_with_index.each_with_object([]) do |(vec, name), names|
    names << name if vec.numeric?
  end
end
|
196
|
+
|
197
|
+
# Names from @vectors whose vector reports itself as numeric.
def numeric_vector_names
  @vectors.select do |name|
    self[name].numeric?
  end
end
|
200
|
+
|
201
|
+
# Return a DataFrame of only the numerical Vectors. If clone: false
# is specified as option, only a *view* of the Vectors will be
# returned. Defaults to clone: true.
#
# @param opts [Hash] options; only :clone is read here
# @return [DaruLite::DataFrame] frame holding the numeric vectors, sharing
#   this frame's index
def only_numerics(opts = {})
  # Anything except an explicit `clone: false` means "clone".
  cln = opts[:clone] != false

  # Look the numeric vector names up once and reuse them for both the data
  # and the order, instead of scanning every vector twice.
  names = numeric_vectors
  arry = names.map { |v| self[v] }

  order = Index.new(names)
  DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
end
|
211
|
+
|
212
|
+
private
|
213
|
+
|
214
|
+
# Vector access entry point. Chooses a strategy from the first argument
# and the kind of vector index:
# * a Range as first name -> a dup of the frame limited to that subset
# * MultiIndex vectors    -> access_vector_multi_index
# * anything else         -> access_vector_single_index
def access_vector(*names)
  leading = names.first
  return dup(@vectors.subset(leading)) if leading.is_a?(Range)
  return access_vector_multi_index(*names) if @vectors.is_a?(MultiIndex)

  access_vector_single_index(*names)
end
|
223
|
+
|
224
|
+
# Vector access when @vectors is a MultiIndex.
#
# Looks +names+ up as one key. An Integer result means exactly one vector
# matched and that vector is returned as is. Otherwise the result is
# mapped, tuple by tuple, to the matching vectors and a new DataFrame is
# built from them.
def access_vector_multi_index(*names)
  matched = @vectors[names]
  return @data[matched] if matched.is_a?(Integer)

  selected = matched.map { |tuple| @data[@vectors[tuple]] }

  # For a partial key, the levels already fixed by +names+ are dropped
  # from the resulting order.
  order = names.size < @vectors.width ? matched.drop_left_level(names.size) : matched

  DaruLite::DataFrame.new(selected, index: @index, order: order)
end
|
235
|
+
|
236
|
+
# Vector access when @vectors is not a MultiIndex.
#
# With fewer than two names the single name is resolved first: a Numeric
# position returns that vector directly, any other result replaces
# +names+ for the frame-building step below.
#
# @raise [IndexError] when a single requested vector cannot be resolved
def access_vector_single_index(*names)
  if names.count < 2
    requested = names.first
    begin
      # DateTimeIndex is queried with #[]; every other index with #pos.
      located =
        if @vectors.is_a?(DaruLite::DateTimeIndex)
          @vectors[requested]
        else
          @vectors.pos(requested)
        end
    rescue IndexError
      raise IndexError, "Specified vector #{requested} does not exist"
    end
    return @data[located] if located.is_a?(Numeric)

    names = located
  end

  pairs = names.map { |name| [name, @data[@vectors.pos(name)]] }
  columns = Hash[pairs]

  # A plain Array of names is wrapped in an Index; anything else is used as is.
  order = names.is_a?(Array) ? DaruLite::Index.new(names) : names
  DaruLite::DataFrame.new(columns, order: order, index: @index, name: @name)
end
|
253
|
+
|
254
|
+
# Row access by index label(s).
#
# The labels are resolved through @index.pos. A Numeric result produces
# a Vector for that one row (named after the first label); any other
# result produces a DataFrame whose index is the matching subset.
def access_row(*indexes)
  located = @index.pos(*indexes)

  unless located.is_a?(Numeric)
    rows = get_rows_for(indexes, by_position: false)
    return DaruLite::DataFrame.new(rows, index: @index.subset(*indexes), order: @vectors)
  end

  DaruLite::Vector.new(get_rows_for([located]), index: @vectors, name: indexes.first)
end
|
265
|
+
|
266
|
+
# Collect, for every vector of the frame, the values at the given keys.
#
# @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
# @param by_position [Boolean] true to go through Vector#at, false to go through Vector#[]
# @return [Array] because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can be either an Array of
#   values (representing a row) or an array of Vectors (that can be seen as rows)
# @raise [RuntimeError] when keys is not an Array
def get_rows_for(keys, by_position: true)
  # Raise with an explicit message: a bare `raise` would re-raise whatever
  # exception is currently being handled (if any) instead of reporting the
  # bad argument. The class stays RuntimeError, as a bare raise produced.
  raise "keys must be an Array, got #{keys.class}" unless keys.is_a?(Array)

  if by_position
    pos = keys
    @data.map { |vector| vector.at(*pos) }
  else
    # TODO: for now (2018-07-27), it is different than using
    #    get_rows_for(@index.pos(*keys))
    #    because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
    indexes = keys
    @data.map { |vec| vec[*indexes] }
  end
end
|
283
|
+
|
284
|
+
# Coerce ranges, integers and arrays in appropriate ways.
#
# * exactly one Integer -> that Integer
# * exactly one Range   -> the Array of positions the range selects out
#   of 0...size
# * exactly one argument of another type -> ArgumentError
# * zero or several arguments -> the arguments as an Array, unchanged
#
# @param positions [Array] requested positions
# @param size [Integer] number of available positions (last argument)
# @raise [ArgumentError] for a single position that is neither Integer nor Range
def coerce_positions(*positions, size)
  return positions unless positions.size == 1

  sole = positions.first
  return sole if sole.is_a?(Integer)
  return size.times.to_a[sole] if sole.is_a?(Range)

  raise ArgumentError, 'Unknown position type.'
end
|
299
|
+
end
|
300
|
+
end
|
301
|
+
end
|
@@ -0,0 +1,144 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Filterable
|
4
|
+
# Return unique rows by vector specified or all vectors
#
# For every group of rows sharing the same values in the considered
# vectors, the first row of the group is kept.
#
# @param vtrs [String][Symbol] vector names(s) that should be considered
#
# @example
#
#    df
#    # => #<DaruLite::DataFrame(6x2)>
#    #      a   b
#    #  0   1   a
#    #  1   2   b
#    #  2   3   c
#    #  3   4   d
#    #  2   3   c
#    #  3   4   f
#
#    df.uniq
#    # => #<DaruLite::DataFrame(5x2)>
#    #      a   b
#    #  0   1   a
#    #  1   2   b
#    #  2   3   c
#    #  3   4   d
#    #  3   4   f
#
#    df.uniq(:a)
#    # => #<DaruLite::DataFrame(4x2)>
#    #      a   b
#    #  0   1   a
#    #  1   2   b
#    #  2   3   c
#    #  3   4   d
#
def uniq(*vtrs)
  # No names given means "compare on every vector".
  considered = vtrs.empty? ? vectors.to_a : Array(vtrs)
  groups = group_by(considered).groups

  # First member of every group, in ascending order.
  keepers = groups.values.map { |members| members[0] }
  row[*keepers.sort]
end
|
42
|
+
|
43
|
+
# Retain vectors or rows if the block returns a truthy value.
#
# == Description
#
# For filtering out certain rows/vectors based on their values,
# use the #filter method. By default it iterates over vectors and
# keeps those vectors for which the block returns true. It accepts
# an optional axis argument which lets you specify whether you want
# to iterate over vectors or rows.
#
# == Arguments
#
# * +axis+ - The axis to map over. Can be :vector (or :column) or :row.
#   Default to :vector.
#
# == Usage
#
#   # Filter vectors
#
#   df.filter do |vector|
#     vector.type == :numeric and vector.median < 50
#   end
#
#   # Filter rows
#
#   df.filter(:row) do |row|
#     row[:a] + row[:d] < 100
#   end
def filter(axis = :vector, &block)
  # The axis-specific work is delegated through dispatch_to_axis_pl.
  dispatch_to_axis_pl(axis, :filter, &block)
end
|
74
|
+
|
75
|
+
# Returns a dataframe in which rows with any of the mentioned values
# are ignored.
# @param [Array] values to reject to form the new dataframe
# @return [DaruLite::DataFrame] Data Frame with only the rows which don't
#   contain the mentioned values
# @example
#   df = DaruLite::DataFrame.new({
#     a: [1,    2,          3,   nil,        Float::NAN, nil, 1,   7],
#     b: [:a,  :b,          nil, Float::NAN, nil,        3,   5,   8],
#     c: ['a',  Float::NAN, 3,   4,          3,          5,   nil, 7]
#   }, index: 11..18)
#   df.reject_values nil, Float::NAN
#   # => #<DaruLite::DataFrame(2x3)>
#   #       a   b   c
#   #   11   1   a   a
#   #   18   7   8   7
def reject_values(*values)
  every_position = size.times.to_a
  kept = every_position - @data.flat_map { |vec| vec.positions(*values) }

  # Handle the case when positions size is 1 and #row_at wouldn't return a df:
  # passing a one-element range instead of the bare position avoids that.
  return row_at(kept.first..kept.first) if kept.size == 1

  row_at(*kept)
end
|
102
|
+
|
103
|
+
# Delete, in place, every row for which the block returns a falsy value.
# The block receives the result of row_at for each position.
def keep_row_if
  doomed = @index.size.times.reject { |position| yield(row_at(position)) }

  # Delete from the highest position down so that earlier deletions do
  # not shift the positions still waiting to be deleted.
  doomed.reverse_each { |position| delete_at_position(position) }
end
|
108
|
+
|
109
|
+
# Delete, in place, every vector for which the block returns a falsy
# value. The block receives the vector and its name.
#
# NOTE(review): vectors are deleted while @vectors is being iterated;
# this presumably relies on delete_vector not mutating the index object
# being walked — confirm.
def keep_vector_if
  @vectors.each do |name|
    next if yield(@data[@vectors[name]], name)

    delete_vector(name)
  end
end
|
114
|
+
|
115
|
+
# Creates a new vector holding the values of field +vec+ taken from the
# rows for which the block returns true.
def filter_vector(vec, &block)
  chosen_rows = each_row.select(&block)
  values = chosen_rows.map { |row| row[vec] }
  DaruLite::Vector.new(values)
end
|
119
|
+
|
120
|
+
# Iterates over each row and retains it in a new DataFrame if the block returns
# true for that row. Without a block an Enumerator is returned.
def filter_rows
  return to_enum(:filter_rows) unless block_given?

  # One block result per index entry, handed to #where as the mask.
  mask = @index.map { |label| yield(access_row(label)) }
  where(mask)
end
|
129
|
+
|
130
|
+
# Iterates over each vector and retains it in a new DataFrame if the block returns
# true for that vector. Without a block an Enumerator is returned.
def filter_vectors(&block)
  return to_enum(:filter_vectors) unless block

  # Work on a copy so the receiver keeps all of its vectors.
  copy = dup
  copy.keep_vector_if(&block)
  copy
end
|
137
|
+
|
138
|
+
# Query a DataFrame by passing a DaruLite::Core::Query::BoolArray object.
# The work is delegated to DaruLite::Core::Query.df_where.
def where(bool_array)
  DaruLite::Core::Query.df_where(self, bool_array)
end
|
142
|
+
end
|
143
|
+
end
|
144
|
+
end
|
@@ -0,0 +1,179 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module IOAble
|
4
|
+
module ClassMethods
|
5
|
+
# Load data from a CSV file. Specify an optional block to grab the CSV
# object and pre-condition it (for example use the `convert` or
# `header_convert` methods).
#
# == Arguments
#
# * path - Local path / Remote URL of the file to load specified as a String.
#
# == Options
#
# Accepts the same options as the DaruLite::DataFrame constructor and CSV.open()
# and uses those to eventually construct the resulting DataFrame.
#
# == Verbose Description
#
# You can specify all the options to the `.from_csv` function that you
# do to the Ruby `CSV.read()` function, since this is what is used internally.
#
# For example, if the columns in your CSV file are separated by something
# other than commas, you can use the `:col_sep` option. If you want to
# convert numeric values to numbers and not keep them as strings, you can
# use the `:converters` option and set it to `:numeric`.
#
# The `.from_csv` function uses the following defaults for reading CSV files
# (that are passed into the `CSV.read()` function):
#
#   {
#     :col_sep           => ',',
#     :converters        => :numeric
#   }
def from_csv(path, opts = {}, &block)
  DaruLite::IO.from_csv(path, opts, &block)
end
|
38
|
+
|
39
|
+
# Read data from an Excel file into a DataFrame.
#
# == Arguments
#
# * path - Path of the file to be read.
#
# == Options
#
# * :worksheet_id - ID of the worksheet that is to be read.
def from_excel(path, opts = {}, &block)
  DaruLite::IO.from_excel(path, opts, &block)
end
|
51
|
+
|
52
|
+
# Execute a database query and return the result as a DataFrame.
#
# @param dbh [DBI::DatabaseHandle, String] A DBI connection OR Path to a SQlite3 database.
# @param query [String] The query to be executed
#
# @return A dataframe containing the data resulting from the query
#
# USE:
#
#  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
#  DaruLite::DataFrame.from_sql(dbh, "SELECT * FROM test")
#
#  #Alternatively
#
#  require 'dbi'
#  DaruLite::DataFrame.from_sql("path/to/sqlite.db", "SELECT * FROM test")
def from_sql(dbh, query)
  DaruLite::IO.from_sql(dbh, query)
end
|
71
|
+
|
72
|
+
# Read a dataframe from AR::Relation
#
# @param relation [ActiveRecord::Relation] An AR::Relation object from which data is loaded
# @param fields [Array] Field names to be loaded (optional)
#
# @return A dataframe containing the data loaded from the relation
#
# USE:
#
#   # When Post model is defined as:
#   class Post < ActiveRecord::Base
#     scope :active, -> { where.not(published_at: nil) }
#   end
#
#   # You can load active posts into a dataframe by:
#   DaruLite::DataFrame.from_activerecord(Post.active, :title, :published_at)
def from_activerecord(relation, *fields)
  DaruLite::IO.from_activerecord(relation, *fields)
end
|
91
|
+
|
92
|
+
# Read the database from a plaintext file. For this method to work,
# the data should be present in a plain text file in columns. See
# spec/fixtures/bank2.dat for an example.
#
# == Arguments
#
# * path - Path of the file to be read.
# * fields - Vector names of the resulting database.
#
# == Usage
#
#   df = DaruLite::DataFrame.from_plaintext 'spec/fixtures/bank2.dat', [:v1,:v2,:v3,:v4,:v5,:v6]
def from_plaintext(path, fields)
  DaruLite::IO.from_plaintext(path, fields)
end
|
107
|
+
|
108
|
+
# Rebuild a DataFrame from a marshalled payload: +data+ is unmarshalled
# and its :data, :index, :order and :name entries are passed to the
# DataFrame constructor.
#
# NOTE(review): Marshal.load can instantiate arbitrary objects, so it
# should only ever be fed trusted input — confirm callers guarantee this.
def _load(data)
  payload = Marshal.load(data)
  frame_data, index, order, name = payload.values_at(:data, :index, :order, :name)
  DaruLite::DataFrame.new(frame_data, index: index, order: order, name: name)
end
|
117
|
+
end
|
118
|
+
|
119
|
+
# Hook run when this module is included: extends the including class
# with ClassMethods, so those methods become class-level methods there.
def self.included(base)
  base.extend(ClassMethods)
end
|
122
|
+
|
123
|
+
# Write this DataFrame to a CSV file.
#
# == Arguments
#
# * filename - Path of CSV file where the DataFrame is to be saved.
#
# == Options
#
# * convert_comma - If set to *true*, will convert any commas in any
#   of the data to full stops ('.').
#
# All the options accepted by CSV.read() can also be passed into this
# function.
def write_csv(filename, opts = {})
  DaruLite::IO.dataframe_write_csv(self, filename, opts)
end
|
138
|
+
|
139
|
+
# Write this dataframe to an Excel Spreadsheet
#
# == Arguments
#
# * filename - The path of the file where the DataFrame should be written.
# * opts - Options hash, passed through unchanged to the IO layer.
def write_excel(filename, opts = {})
  DaruLite::IO.dataframe_write_excel(self, filename, opts)
end
|
147
|
+
|
148
|
+
# Insert each case of the DataFrame into the selected table
#
# == Arguments
#
# * dbh - DBI database connection object.
# * table - Name of the table to insert into.
#
# == Usage
#
#  ds = DaruLite::DataFrame.new({:id=>DaruLite::Vector.new([1,2,3]), :name=>DaruLite::Vector.new(["a","b","c"])})
#  dbh = DBI.connect("DBI:Mysql:database:localhost", "user", "password")
#  ds.write_sql(dbh,"test")
def write_sql(dbh, table)
  DaruLite::IO.dataframe_write_sql(self, dbh, table)
end
|
163
|
+
|
164
|
+
# Use marshalling to save the dataframe to a file.
#
# * filename - Path of the file to write; handed to DaruLite::IO.save
#   together with this dataframe.
def save(filename)
  DaruLite::IO.save(self, filename)
end
|
168
|
+
|
169
|
+
def _dump(_depth)
|
170
|
+
Marshal.dump(
|
171
|
+
data: @data,
|
172
|
+
index: @index.to_a,
|
173
|
+
order: @vectors.to_a,
|
174
|
+
name: @name
|
175
|
+
)
|
176
|
+
end
|
177
|
+
end
|
178
|
+
end
|
179
|
+
end
|