daru_lite 0.1 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.github/ISSUE_TEMPLATE/bug_report.md +38 -0
- data/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- data/.github/workflows/ci.yml +20 -0
- data/.rubocop_todo.yml +35 -33
- data/README.md +19 -115
- data/daru_lite.gemspec +1 -0
- data/lib/daru_lite/data_frame/aggregatable.rb +165 -0
- data/lib/daru_lite/data_frame/calculatable.rb +140 -0
- data/lib/daru_lite/data_frame/convertible.rb +107 -0
- data/lib/daru_lite/data_frame/duplicatable.rb +64 -0
- data/lib/daru_lite/data_frame/fetchable.rb +301 -0
- data/lib/daru_lite/data_frame/filterable.rb +144 -0
- data/lib/daru_lite/data_frame/i_o_able.rb +179 -0
- data/lib/daru_lite/data_frame/indexable.rb +168 -0
- data/lib/daru_lite/data_frame/iterable.rb +339 -0
- data/lib/daru_lite/data_frame/joinable.rb +152 -0
- data/lib/daru_lite/data_frame/missable.rb +75 -0
- data/lib/daru_lite/data_frame/pivotable.rb +108 -0
- data/lib/daru_lite/data_frame/queryable.rb +67 -0
- data/lib/daru_lite/data_frame/setable.rb +109 -0
- data/lib/daru_lite/data_frame/sortable.rb +241 -0
- data/lib/daru_lite/dataframe.rb +142 -2355
- data/lib/daru_lite/index/index.rb +13 -0
- data/lib/daru_lite/maths/statistics/vector.rb +1 -1
- data/lib/daru_lite/vector/aggregatable.rb +9 -0
- data/lib/daru_lite/vector/calculatable.rb +78 -0
- data/lib/daru_lite/vector/convertible.rb +77 -0
- data/lib/daru_lite/vector/duplicatable.rb +17 -0
- data/lib/daru_lite/vector/fetchable.rb +175 -0
- data/lib/daru_lite/vector/filterable.rb +128 -0
- data/lib/daru_lite/vector/indexable.rb +77 -0
- data/lib/daru_lite/vector/iterable.rb +95 -0
- data/lib/daru_lite/vector/joinable.rb +17 -0
- data/lib/daru_lite/vector/missable.rb +124 -0
- data/lib/daru_lite/vector/queryable.rb +45 -0
- data/lib/daru_lite/vector/setable.rb +47 -0
- data/lib/daru_lite/vector/sortable.rb +113 -0
- data/lib/daru_lite/vector.rb +36 -932
- data/lib/daru_lite/version.rb +1 -1
- data/spec/data_frame/aggregatable_example.rb +65 -0
- data/spec/data_frame/buildable_example.rb +109 -0
- data/spec/data_frame/calculatable_example.rb +135 -0
- data/spec/data_frame/convertible_example.rb +180 -0
- data/spec/data_frame/duplicatable_example.rb +111 -0
- data/spec/data_frame/fetchable_example.rb +476 -0
- data/spec/data_frame/filterable_example.rb +250 -0
- data/spec/data_frame/indexable_example.rb +221 -0
- data/spec/data_frame/iterable_example.rb +465 -0
- data/spec/data_frame/joinable_example.rb +106 -0
- data/spec/data_frame/missable_example.rb +47 -0
- data/spec/data_frame/pivotable_example.rb +297 -0
- data/spec/data_frame/queryable_example.rb +92 -0
- data/spec/data_frame/setable_example.rb +482 -0
- data/spec/data_frame/sortable_example.rb +350 -0
- data/spec/dataframe_spec.rb +181 -3243
- data/spec/index/index_spec.rb +8 -0
- data/spec/vector/aggregatable_example.rb +27 -0
- data/spec/vector/calculatable_example.rb +82 -0
- data/spec/vector/convertible_example.rb +126 -0
- data/spec/vector/duplicatable_example.rb +48 -0
- data/spec/vector/fetchable_example.rb +463 -0
- data/spec/vector/filterable_example.rb +165 -0
- data/spec/vector/indexable_example.rb +201 -0
- data/spec/vector/iterable_example.rb +111 -0
- data/spec/vector/joinable_example.rb +25 -0
- data/spec/vector/missable_example.rb +88 -0
- data/spec/vector/queryable_example.rb +91 -0
- data/spec/vector/setable_example.rb +300 -0
- data/spec/vector/sortable_example.rb +242 -0
- data/spec/vector_spec.rb +111 -1805
- metadata +102 -3
- data/.github/ISSUE_TEMPLATE.md +0 -18
@@ -0,0 +1,140 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Calculatable
|
4
|
+
# Sum all numeric/specified vectors in the DataFrame.
|
5
|
+
#
|
6
|
+
# Returns a new vector containing the sum of all numeric
|
7
|
+
# or specified vectors of the DataFrame. By default, if the vector
|
8
|
+
# contains a nil, the sum is nil.
|
9
|
+
# With :skipnil argument set to true, nil values are assumed to be
|
10
|
+
# 0 (zero) and the sum vector is returned.
|
11
|
+
#
|
12
|
+
# @param args [Array] List of vectors to sum. Default is nil in which case
|
13
|
+
# all numeric vectors are summed.
|
14
|
+
#
|
15
|
+
# @option opts [Boolean] :skipnil Consider nils as 0. Default is false.
|
16
|
+
#
|
17
|
+
# @return Vector with sum of all vectors specified in the argument.
|
18
|
+
# If vecs parameter is empty, sum all numeric vector.
|
19
|
+
#
|
20
|
+
# @example
|
21
|
+
# df = DaruLite::DataFrame.new({
|
22
|
+
# a: [1, 2, nil],
|
23
|
+
# b: [2, 1, 3],
|
24
|
+
# c: [1, 1, 1]
|
25
|
+
# })
|
26
|
+
# => #<DaruLite::DataFrame(3x3)>
|
27
|
+
# a b c
|
28
|
+
# 0 1 2 1
|
29
|
+
# 1 2 1 1
|
30
|
+
# 2 nil 3 1
|
31
|
+
# df.vector_sum [:a, :c]
|
32
|
+
# => #<DaruLite::Vector(3)>
|
33
|
+
# 0 2
|
34
|
+
# 1 3
|
35
|
+
# 2 nil
|
36
|
+
# df.vector_sum
|
37
|
+
# => #<DaruLite::Vector(3)>
|
38
|
+
# 0 4
|
39
|
+
# 1 4
|
40
|
+
# 2 nil
|
41
|
+
# df.vector_sum skipnil: true
|
42
|
+
# => #<DaruLite::Vector(3)>
|
43
|
+
# c
|
44
|
+
# 0 4
|
45
|
+
# 1 4
|
46
|
+
# 2 4
|
47
|
+
#
|
48
|
+
def vector_sum(*args)
  # A trailing Hash carries the named options; anything before it is positional.
  settings = { vecs: nil, skipnil: false }
  settings = settings.merge(args.pop) if args.last.is_a?(::Hash)

  # Positional arguments win over the option hash; with no vectors given at
  # all, fall back to every numeric vector.
  targets = args[0] || settings[:vecs] || numeric_vectors
  skipnil = args[1] || settings[:skipnil]

  # Start from an all-zero vector and fold each requested vector into it.
  zeros = DaruLite::Vector.new([0] * @size, index: @index, name: @name, dtype: @dtype)
  targets.inject(zeros) do |running_total, vec_name|
    self[vec_name].add(running_total, skipnil: skipnil)
  end
end
|
59
|
+
|
60
|
+
# Calculate mean of the rows of the dataframe.
|
61
|
+
#
|
62
|
+
# == Arguments
|
63
|
+
#
|
64
|
+
# * +max_missing+ - The maximum number of elements in the row that can be
|
65
|
+
# missing for the mean calculation to happen. Defaults to 0.
|
66
|
+
def vector_mean(max_missing = 0)
  # FIXME: in vector_sum we preserve created vector dtype, but
  # here we are not. Is this by design or ...? - zverok, 2016-05-18
  result = DaruLite::Vector.new([0] * @size, index: @index, name: "mean_#{@name}")

  each_row_with_index.each do |row, i|
    # A row with more missing values than allowed yields nil instead of a mean.
    missing_count = row.indexes(*DaruLite::MISSING_VALUES).size
    result[i] = missing_count > max_missing ? nil : row.mean
  end

  result
end
|
75
|
+
|
76
|
+
# Returns a vector, based on a string with a calculation based
|
77
|
+
# on vector.
|
78
|
+
#
|
79
|
+
# The calculation will be eval'ed, so you can put any variable
|
80
|
+
# or expression valid on ruby.
|
81
|
+
#
|
82
|
+
# For example:
|
83
|
+
# a = DaruLite::Vector.new [1,2]
|
84
|
+
# b = DaruLite::Vector.new [3,4]
|
85
|
+
# ds = DaruLite::DataFrame.new({:a => a,:b => b})
|
86
|
+
# ds.compute("a+b")
|
87
|
+
# => Vector [4,6]
|
88
|
+
def compute(text = nil, &block)
  # Evaluate a calculation in the context of this object.
  #
  # text  - String of Ruby code to evaluate; optional when a block is given.
  # block - evaluated instead of text when supplied (it takes precedence).
  #
  # Returns whatever the evaluated code returns.
  # Raises ArgumentError when neither a String nor a block is supplied.
  return instance_eval(&block) if block

  raise ArgumentError, 'compute requires a String expression or a block' if text.nil?

  # NOTE(security): the string is executed as arbitrary Ruby via instance_eval;
  # never pass untrusted input here.
  instance_eval(text)
end
|
93
|
+
|
94
|
+
# DSL for yielding each row and returning a DaruLite::Vector based on the
|
95
|
+
# value each run of the block returns.
|
96
|
+
#
|
97
|
+
# == Usage
|
98
|
+
#
|
99
|
+
# a1 = DaruLite::Vector.new([1, 2, 3, 4, 5, 6, 7])
|
100
|
+
# a2 = DaruLite::Vector.new([10, 20, 30, 40, 50, 60, 70])
|
101
|
+
# a3 = DaruLite::Vector.new([100, 200, 300, 400, 500, 600, 700])
|
102
|
+
# ds = DaruLite::DataFrame.new({ :a => a1, :b => a2, :c => a3 })
|
103
|
+
# total = ds.vector_by_calculation { a + b + c }
|
104
|
+
# # <DaruLite::Vector:82314050 @name = nil @size = 7 >
|
105
|
+
# # nil
|
106
|
+
# # 0 111
|
107
|
+
# # 1 222
|
108
|
+
# # 2 333
|
109
|
+
# # 3 444
|
110
|
+
# # 4 555
|
111
|
+
# # 5 666
|
112
|
+
# # 6 777
|
113
|
+
def vector_by_calculation(&block)
  # Run the block once per row, with the row itself as the receiver.
  computed = each_row.map do |current_row|
    current_row.instance_eval(&block)
  end

  DaruLite::Vector.new(computed, index: @index)
end
|
118
|
+
|
119
|
+
def vector_count_characters(vecs = nil)
  # Default to every vector of the frame when none are named.
  names = vecs || @vectors.to_a

  # For each row, add up the string length of the selected cells.
  collect_rows do |row|
    names.map { |vec_name| row[vec_name].to_s.size }.sum
  end
end
|
126
|
+
|
127
|
+
# Generate a summary of this DataFrame based on individual vectors in the DataFrame
|
128
|
+
# @return [String] String containing the summary of the DataFrame
|
129
|
+
def summary
  # Header: frame name followed by the row count.
  report = "= #{name}"
  report << "\n Number of rows: #{nrows}"

  # Then one section per vector, delegating to that vector's own summary.
  @vectors.each do |vec_name|
    report << "\n Element:[#{vec_name}]\n" << self[vec_name].summary(1)
  end

  report
end
|
138
|
+
end
|
139
|
+
end
|
140
|
+
end
|
@@ -0,0 +1,107 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Convertible
|
4
|
+
# Create a SQL CREATE TABLE statement based on this DataFrame
|
5
|
+
#
|
6
|
+
# == Arguments
|
7
|
+
#
|
8
|
+
# * table - String specifying name of the table that will be created in SQL.
|
9
|
+
# * charset - Character set. Default is "UTF8".
|
10
|
+
#
|
11
|
+
# @example
|
12
|
+
#
|
13
|
+
# ds = DaruLite::DataFrame.new({
|
14
|
+
# :id => DaruLite::Vector.new([1,2,3,4,5]),
|
15
|
+
# :name => DaruLite::Vector.new(%w{Alex Peter Susan Mary John})
|
16
|
+
# })
|
17
|
+
# ds.create_sql('names')
|
18
|
+
# #=>"CREATE TABLE names (id INTEGER,\n name VARCHAR (255)) CHARACTER SET=UTF8;"
|
19
|
+
#
|
20
|
+
def create_sql(table, charset = 'UTF8')
  # NOTE: table name, column names and charset are interpolated verbatim;
  # nothing is quoted or escaped here.
  statement_head = "CREATE TABLE #{table} ("

  # One "<name> <db type>" definition per vector.
  column_defs = vectors.to_a.map do |col_name|
    column = self[col_name]
    "#{col_name} #{column.db_type}"
  end

  statement_head + column_defs.join(",\n ") + ") CHARACTER SET=#{charset};"
end
|
29
|
+
|
30
|
+
# Returns the dataframe. This can be convenient when the user does not
|
31
|
+
# know whether the object is a vector or a dataframe.
|
32
|
+
# @return [self] the dataframe
|
33
|
+
def to_df
  # Already a dataframe: hand back the receiver itself, not a copy.
  return self
end
|
36
|
+
|
37
|
+
# Convert all vectors of type *:numeric* into a Matrix.
|
38
|
+
def to_matrix
  # Non-numeric vectors are dropped; each remaining vector becomes a column.
  numeric_columns = each_vector.select(&:numeric?)
  Matrix.columns(numeric_columns.map(&:to_a))
end
|
41
|
+
|
42
|
+
# Converts the DataFrame into an array of hashes where key is vector name
|
43
|
+
# and value is the corresponding element. The 0th index of the array contains
|
44
|
+
# the array of hashes while the 1st index contains the indexes of each row
|
45
|
+
# of the dataframe. Each element in the index array corresponds to its row
|
46
|
+
# in the array of hashes, which has the same index.
|
47
|
+
def to_a
  # Element 0: one hash per row. Element 1: the row index as an array.
  row_hashes = each_row.map(&:to_h)
  [row_hashes, @index.to_a]
end
|
50
|
+
|
51
|
+
# Convert to json. If no_index is true (the default) the index will NOT be included
|
52
|
+
# in the JSON thus created.
|
53
|
+
def to_json(no_index = true)
  # to_a yields [row hashes, index]; with no_index set only the row hashes
  # (element 0) are serialized, otherwise the whole pair is.
  payload = no_index ? to_a[0] : to_a
  payload.to_json
end
|
60
|
+
|
61
|
+
# Converts DataFrame to a hash (explicit) with keys as vector names and values as
|
62
|
+
# the corresponding vectors.
|
63
|
+
def to_h
  # Vector names and @data are positionally aligned: pair name i with @data[i].
  @vectors.each_with_index.each_with_object({}) do |(vec_name, position), mapping|
    mapping[vec_name] = @data[position]
  end
end
|
68
|
+
|
69
|
+
# Convert to html for IRuby.
|
70
|
+
def to_html(threshold = DaruLite.max_rows)
  # These locals are captured by `binding` below, so the ERB template can read
  # them; their names are presumably referenced by the template and must stay.
  table_thead = to_html_thead
  table_tbody = to_html_tbody(threshold)

  # MultiIndex frames are rendered with a dedicated template.
  path = File.expand_path(
    index.is_a?(MultiIndex) ? '../iruby/templates/dataframe_mi.html.erb' : '../iruby/templates/dataframe.html.erb',
    __dir__
  )

  template_source = File.read(path).strip
  ERB.new(template_source).result(binding)
end
|
80
|
+
|
81
|
+
def to_html_thead
  # Pick the header template matching the index type.
  table_thead_path = File.expand_path(
    if index.is_a?(MultiIndex)
      '../iruby/templates/dataframe_mi_thead.html.erb'
    else
      '../iruby/templates/dataframe_thead.html.erb'
    end,
    __dir__
  )

  # Render with this method's binding so the template sees the frame's state.
  template_source = File.read(table_thead_path).strip
  ERB.new(template_source).result(binding)
end
|
90
|
+
|
91
|
+
def to_html_tbody(threshold = DaruLite.max_rows)
  # A nil threshold means "no limit": fall back to the full size. The local is
  # visible to the template through `binding`, so its name is kept.
  threshold = @size unless threshold

  # Pick the body template matching the index type.
  table_tbody_path = File.expand_path(
    if index.is_a?(MultiIndex)
      '../iruby/templates/dataframe_mi_tbody.html.erb'
    else
      '../iruby/templates/dataframe_tbody.html.erb'
    end,
    __dir__
  )

  template_source = File.read(table_tbody_path).strip
  ERB.new(template_source).result(binding)
end
|
101
|
+
|
102
|
+
def to_s
|
103
|
+
"#<#{self.class}#{": #{@name}" if @name}(#{nrows}x#{ncols})>"
|
104
|
+
end
|
105
|
+
end
|
106
|
+
end
|
107
|
+
end
|
@@ -0,0 +1,64 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Duplicatable
|
4
|
+
extend Gem::Deprecate
|
5
|
+
|
6
|
+
# Duplicate the DataFrame entirely.
|
7
|
+
#
|
8
|
+
# == Arguments
|
9
|
+
#
|
10
|
+
# * +vectors_to_dup+ - An Array specifying the names of Vectors to
|
11
|
+
# be duplicated. Will duplicate the entire DataFrame if not specified.
|
12
|
+
def dup(vectors_to_dup = nil)
  # No explicit list means "duplicate every vector".
  names = vectors_to_dup || @vectors.to_a

  # Copy each requested vector, looked up by its position in @data.
  copies = names.map do |vec_name|
    position = @vectors.pos(vec_name)
    @data[position].dup
  end
  order = DaruLite::Index.new(names)

  DaruLite::DataFrame.new(copies, order: order, index: @index.dup, name: @name, clone: true)
end
|
20
|
+
|
21
|
+
# Only clone the structure of the DataFrame.
|
22
|
+
def clone_structure
  # Same vector order, index and name as the receiver, but no data.
  layout = { order: @vectors.dup, index: @index.dup, name: @name }
  DaruLite::DataFrame.new([], **layout)
end
|
25
|
+
|
26
|
+
# Returns a 'view' of the DataFrame, i.e the object ID's of vectors are
|
27
|
+
# preserved.
|
28
|
+
#
|
29
|
+
# == Arguments
|
30
|
+
#
|
31
|
+
# +vectors_to_clone+ - Names of vectors to clone. Optional. Will return
|
32
|
+
# a view of the whole data frame otherwise.
|
33
|
+
def clone(*vectors_to_clone)
  # Accept both clone(:a, :b) and clone([:a, :b]).
  vectors_to_clone.flatten! if ArrayHelper.array_of?(vectors_to_clone, Array)
  # No names given: view the whole frame.
  vectors_to_clone = @vectors.to_a if vectors_to_clone.empty?

  # Built with clone: false, so the vector objects themselves are passed on.
  view = vectors_to_clone.each_with_object({}) do |vec_name, acc|
    acc[vec_name] = self[vec_name]
  end

  DaruLite::DataFrame.new(view, clone: false, order: vectors_to_clone, name: @name)
end
|
40
|
+
|
41
|
+
# Returns a 'shallow' copy of DataFrame if missing data is not present,
|
42
|
+
# or a full copy of only valid data if missing data is present.
|
43
|
+
def clone_only_valid
  # Nothing missing: a plain clone is enough.
  return clone unless include_values?(*DaruLite::MISSING_VALUES)

  # Otherwise drop the missing values.
  reject_values(*DaruLite::MISSING_VALUES)
end
|
50
|
+
|
51
|
+
# Creates a new duplicate dataframe containing only rows
|
52
|
+
# without a single missing value.
|
53
|
+
def dup_only_valid(vecs = nil)
  # Build a dataframe restricted to the rows that hold no missing value.
  #
  # vecs - optional list of vector names to duplicate first; when nil the
  #        rows are taken from the receiver itself.
  #
  # Missing values are looked up across ALL vectors in @data, not just vecs.
  # flat_map returns [] for a frame with no vectors (the previous
  # inject(&:concat) returned nil there and crashed on .uniq) and does not
  # mutate the arrays returned by Vector#indexes.
  rows_with_nil = @data.flat_map { |vec| vec.indexes(*DaruLite::MISSING_VALUES) }.uniq

  row_indexes = @index.to_a
  source = vecs.nil? ? self : dup(vecs)
  source.row[*(row_indexes - rows_with_nil)]
end
|
61
|
+
deprecate :dup_only_valid, :reject_values, 2016, 10
|
62
|
+
end
|
63
|
+
end
|
64
|
+
end
|
@@ -0,0 +1,301 @@
|
|
1
|
+
module DaruLite
|
2
|
+
class DataFrame
|
3
|
+
module Fetchable
|
4
|
+
# Access row or vector. Specify name of row/vector followed by axis(:row, :vector).
|
5
|
+
# Defaults to *:vector*. Use of this method is not recommended for accessing
|
6
|
+
# rows. Use df.row[:a] for accessing row with index ':a'.
|
7
|
+
def [](*names)
  # Resolve the axis first (defaulting to :vector), then dispatch the access.
  target_axis = extract_axis(names, :vector)
  dispatch_to_axis(target_axis, :access, *names)
end
|
11
|
+
|
12
|
+
# Retrieve rows by positions
|
13
|
+
# @param [Array<Integer>] positions of rows to retrieve
|
14
|
+
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
15
|
+
# @example
|
16
|
+
# df = DaruLite::DataFrame.new({
|
17
|
+
# a: [1, 2, 3],
|
18
|
+
# b: ['a', 'b', 'c']
|
19
|
+
# })
|
20
|
+
# df.row_at 1, 2
|
21
|
+
# # => #<DaruLite::DataFrame(2x2)>
|
22
|
+
# # a b
|
23
|
+
# # 1 2 b
|
24
|
+
# # 2 3 c
|
25
|
+
def row_at(*positions)
  # Keep the caller's arguments: the multi-row branch needs them un-coerced.
  requested = positions
  resolved = coerce_positions(*requested, nrows)
  validate_positions(*resolved, nrows)

  # A single Integer position yields one row as a Vector named after its index entry.
  if resolved.is_a?(Integer)
    values = get_rows_for([resolved])
    return DaruLite::Vector.new(values, index: @vectors, name: @index.at(resolved))
  end

  # Anything else yields a DataFrame over the requested rows.
  rows = get_rows_for(requested)
  DaruLite::DataFrame.new(rows, index: @index.at(*requested), order: @vectors, name: @name)
end
|
43
|
+
|
44
|
+
# Retrieve vectors by positions
|
45
|
+
# @param [Array<Integer>] positions of vectors to retrieve
|
46
|
+
# @return [DaruLite::Vector, DaruLite::DataFrame] vector for single position and dataframe for multiple positions
|
47
|
+
# @example
|
48
|
+
# df = DaruLite::DataFrame.new({
|
49
|
+
# a: [1, 2, 3],
|
50
|
+
# b: ['a', 'b', 'c']
|
51
|
+
# })
|
52
|
+
# df.at 0
|
53
|
+
# # => #<DaruLite::Vector(3)>
|
54
|
+
# # a
|
55
|
+
# # 0 1
|
56
|
+
# # 1 2
|
57
|
+
# # 2 3
|
58
|
+
def at(*positions)
  # A trailing axis marker is stripped from the arguments; :row delegates to row_at.
  return row_at(*positions) if AXES.include?(positions.last) && positions.pop == :row

  # Keep the caller's arguments: they are reused to slice the vector index.
  requested = positions
  resolved = coerce_positions(*requested, ncols)
  validate_positions(*resolved, ncols)

  # A single Integer position yields a copy of that one vector.
  return @data[resolved].dup if resolved.is_a?(Integer)

  columns = resolved.map { |pos| @data[pos].dup }
  DaruLite::DataFrame.new(columns, index: @index, order: @vectors.at(*requested), name: @name)
end
|
77
|
+
|
78
|
+
# The first ten elements of the DataFrame
|
79
|
+
#
|
80
|
+
# @param [Fixnum] quantity (10) The number of elements to display from the top.
|
81
|
+
def head(quantity = 10)
  # Positions are zero-based, so the last one wanted is quantity - 1.
  last_position = quantity - 1
  row.at(0..last_position)
end
|
84
|
+
alias first head
|
85
|
+
|
86
|
+
# The last ten elements of the DataFrame
|
87
|
+
#
|
88
|
+
# @param [Fixnum] quantity (10) The number of elements to display from the bottom.
|
89
|
+
def tail(quantity = 10)
  # Never reach back further than the frame is long.
  first_position = -[quantity, size].min
  row.at(first_position..-1)
end
|
93
|
+
alias last tail
|
94
|
+
|
95
|
+
# Extract a dataframe given row indexes or positions
|
96
|
+
# @param keys [Array] can be positions (if by_position is true) or indexes (if by_position is false)
|
97
|
+
# @return [DaruLite::Dataframe]
|
98
|
+
def get_sub_dataframe(keys, by_position: true)
  # No keys: an empty dataframe.
  return DaruLite::DataFrame.new({}) if keys == []

  # Index labels are translated to positions before the lookup.
  keys = @index.pos(*keys) unless by_position

  result = row_at(*keys)
  # A single row comes back as a Vector; turn it into a one-row dataframe.
  return result unless result.is_a?(DaruLite::Vector)

  result.to_df.transpose
end
|
108
|
+
|
109
|
+
def get_vector_anyways(v)
  # Unknown vector: an all-nil array of the frame's size stands in for it.
  return Array.new(size) unless @vectors.include?(v)

  self[v].to_a
end
|
112
|
+
|
113
|
+
# @param indexes [Array] index(s) at which row tuples are retrieved
|
114
|
+
# @return [Array] returns array of row tuples at given index(s)
|
115
|
+
# @example Using DaruLite::Index
|
116
|
+
# df = DaruLite::DataFrame.new({
|
117
|
+
# a: [1, 2, 3],
|
118
|
+
# b: ['a', 'a', 'b']
|
119
|
+
# })
|
120
|
+
#
|
121
|
+
# df.access_row_tuples_by_indexs(1,2)
|
122
|
+
# # => [[2, "a"], [3, "b"]]
|
123
|
+
#
|
124
|
+
# df.index = DaruLite::Index.new([:one,:two,:three])
|
125
|
+
# df.access_row_tuples_by_indexs(:one,:three)
|
126
|
+
# # => [[1, "a"], [3, "b"]]
|
127
|
+
#
|
128
|
+
# @example Using DaruLite::MultiIndex
|
129
|
+
# mi_idx = DaruLite::MultiIndex.from_tuples [
|
130
|
+
# [:a,:one,:bar],
|
131
|
+
# [:a,:one,:baz],
|
132
|
+
# [:b,:two,:bar],
|
133
|
+
# [:a,:two,:baz],
|
134
|
+
# ]
|
135
|
+
# df_mi = DaruLite::DataFrame.new({
|
136
|
+
# a: 1..4,
|
137
|
+
# b: 'a'..'d'
|
138
|
+
# }, index: mi_idx )
|
139
|
+
#
|
140
|
+
# df_mi.access_row_tuples_by_indexs(:b, :two, :bar)
|
141
|
+
# # => [[3, "c"]]
|
142
|
+
# df_mi.access_row_tuples_by_indexs(:a)
|
143
|
+
# # => [[1, "a"], [2, "b"], [4, "d"]]
|
144
|
+
def access_row_tuples_by_indexs(*indexes)
  # MultiIndex frames go through a sub-dataframe and are read row by row.
  if @index.is_a?(DaruLite::MultiIndex)
    return get_sub_dataframe(indexes, by_position: false).map_rows(&:to_a)
  end

  located = @index.pos(*indexes)

  unless located.is_a?(Numeric)
    # Several rows: fetch per-vector slices, then regroup them into one tuple per index.
    slices = get_rows_for(indexes, by_position: false)
    return indexes.map { |label| slices.map { |slice| slice[label] } }
  end

  # Single position: always answer with an array of tuples.
  tuple = get_rows_for([located])
  tuple.first.is_a?(Array) ? tuple : [tuple]
end
|
157
|
+
|
158
|
+
# Split the dataframe into many dataframes based on category vector
|
159
|
+
# @param [object] cat_name name of category vector to split the dataframe
|
160
|
+
# @return [Array] array of dataframes split by category with category vector
|
161
|
+
# used to split not included
|
162
|
+
# @example
|
163
|
+
# df = DaruLite::DataFrame.new({
|
164
|
+
# a: [1, 2, 3],
|
165
|
+
# b: ['a', 'a', 'b']
|
166
|
+
# })
|
167
|
+
# df.to_category :b
|
168
|
+
# df.split_by_category :b
|
169
|
+
# # => [#<DaruLite::DataFrame: a (2x1)>
|
170
|
+
# # a
|
171
|
+
# # 0 1
|
172
|
+
# # 1 2,
|
173
|
+
# # #<DaruLite::DataFrame: b (1x1)>
|
174
|
+
# # a
|
175
|
+
# # 2 3]
|
176
|
+
def split_by_category(cat_name)
  cat_dv = self[cat_name]
  raise ArgumentError, "#{cat_name} is not a category vector" unless cat_dv.category?

  # One dataframe per category: filter its rows, name the result after the
  # category, and drop the category vector itself.
  cat_dv.categories.map do |category|
    subset = where(cat_dv.eq(category))
    renamed = subset.rename(category)
    renamed.delete_vector(cat_name)
  end
end
|
187
|
+
|
188
|
+
# Return the indexes of all the numeric vectors. Will include vectors with nils
|
189
|
+
# along with numbers.
|
190
|
+
def numeric_vectors
  # FIXME: Why _with_index ?..
  # Collect the second element of each (vector, index) pair whose vector is numeric.
  each_vector_with_index.each_with_object([]) do |(vec, vec_index), found|
    found << vec_index if vec.numeric?
  end
end
|
196
|
+
|
197
|
+
def numeric_vector_names
  # Keep only the names whose vector reports itself as numeric.
  @vectors.select do |vec_name|
    column = self[vec_name]
    column.numeric?
  end
end
|
200
|
+
|
201
|
+
# Return a DataFrame of only the numerical Vectors. If clone: false
|
202
|
+
# is specified as option, only a *view* of the Vectors will be
|
203
|
+
# returned. Defaults to clone: true.
|
204
|
+
def only_numerics(opts = {})
  # Build a dataframe holding only the numeric vectors.
  #
  # opts - Hash; clone: false asks for a view of the vectors rather than
  #        copies. Any other value (or no :clone key) means clone.
  cln = opts[:clone] != false

  # Scan for numeric vectors once and reuse the result for both the data and
  # the ordering (it was previously computed twice).
  names = numeric_vectors
  arry = names.map { |v| self[v] }
  order = Index.new(names)

  DaruLite::DataFrame.new(arry, clone: cln, order: order, index: @index)
end
|
211
|
+
|
212
|
+
private
|
213
|
+
|
214
|
+
def access_vector(*names)
  first_name = names.first

  # A Range selects a contiguous subset of vectors, returned as a duplicate.
  return dup(@vectors.subset(first_name)) if first_name.is_a?(Range)
  # Otherwise the lookup strategy depends on the kind of vector index.
  return access_vector_multi_index(*names) if @vectors.is_a?(MultiIndex)

  access_vector_single_index(*names)
end
|
223
|
+
|
224
|
+
def access_vector_multi_index(*names)
  located = @vectors[names]

  # A full tuple resolves to one Integer position: return that vector directly.
  return @data[located] if located.is_a?(Integer)

  # Otherwise `located` enumerates matching tuples; gather their vectors.
  columns = located.map { |tuple| @data[@vectors[tuple]] }

  # When only leading levels were given, drop them from the resulting order.
  order = names.size < @vectors.width ? located.drop_left_level(names.size) : located

  DaruLite::DataFrame.new(columns, index: @index, order: order)
end
|
235
|
+
|
236
|
+
def access_vector_single_index(*names)
  if names.count < 2
    requested = names.first
    begin
      # DateTimeIndex is queried through [] rather than pos.
      pos = if @vectors.is_a?(DaruLite::DateTimeIndex)
              @vectors[requested]
            else
              @vectors.pos(requested)
            end
    rescue IndexError
      raise IndexError, "Specified vector #{requested} does not exist"
    end

    # A numeric position identifies exactly one vector.
    return @data[pos] if pos.is_a?(Numeric)

    # Otherwise the lookup produced several names; continue with those.
    names = pos
  end

  pairs = names.map { |name| [name, @data[@vectors.pos(name)]] }
  new_vectors = pairs.to_h

  # Plain arrays of names are wrapped into an Index; anything else is used as-is.
  order = names.is_a?(Array) ? DaruLite::Index.new(names) : names
  DaruLite::DataFrame.new(new_vectors, order: order, index: @index, name: @name)
end
|
253
|
+
|
254
|
+
def access_row(*indexes)
  located = @index.pos(*indexes)

  unless located.is_a?(Numeric)
    # Several rows: build a dataframe over the matching slice of the index.
    rows = get_rows_for(indexes, by_position: false)
    return DaruLite::DataFrame.new(rows, index: @index.subset(*indexes), order: @vectors)
  end

  # One row: a Vector keyed by vector names and named after the requested index.
  values = get_rows_for([located])
  DaruLite::Vector.new(values, index: @vectors, name: indexes.first)
end
|
265
|
+
|
266
|
+
# @param keys [Array] can be an array of positions (if by_position is true) or indexes (if by_position is false)
|
267
|
+
# because of coercion by DaruLite::Vector#at and DaruLite::Vector#[], can return either an Array of
|
268
|
+
# values (representing a row) or an array of Vectors (that can be seen as rows)
|
269
|
+
def get_rows_for(keys, by_position: true)
  # Fetch, from every vector in @data, the elements designated by keys.
  #
  # keys        - Array of positions (by_position: true) or of index labels
  #               (by_position: false).
  # by_position - selects Vector#at (positions) or Vector#[] (labels).
  #
  # Returns an Array with one entry per vector in @data.
  # Raises RuntimeError when keys is not an Array (same class as the previous
  # bare `raise`, now with a message that says what went wrong).
  raise "keys must be an Array, got #{keys.class}" unless keys.is_a?(Array)

  return @data.map { |vector| vector.at(*keys) } if by_position

  # TODO: for now (2018-07-27), it is different than using
  # get_rows_for(@index.pos(*keys))
  # because DaruLite::Vector#at and DaruLite::Vector#[] don't handle DaruLite::MultiIndex the same way
  @data.map { |vector| vector[*keys] }
end
|
283
|
+
|
284
|
+
# coerce ranges, integers and array in appropriate ways
|
285
|
+
def coerce_positions(*positions, size)
  # Several positions are passed through untouched.
  return positions unless positions.size == 1

  only = positions.first
  case only
  when Integer then only
  # A Range is expanded against 0...size, so negative bounds count from the end.
  when Range then size.times.to_a[only]
  else raise ArgumentError, 'Unknown position type.'
  end
end
|
299
|
+
end
|
300
|
+
end
|
301
|
+
end
|