daru 0.1.5 → 0.3

Sign up to get free protection for your applications and to get access to all the features.
Files changed (106) hide show
  1. checksums.yaml +5 -5
  2. data/.github/ISSUE_TEMPLATE.md +18 -0
  3. data/.gitignore +1 -0
  4. data/.rubocop.yml +21 -7
  5. data/.travis.yml +10 -5
  6. data/CONTRIBUTING.md +15 -10
  7. data/History.md +124 -2
  8. data/README.md +37 -9
  9. data/ReleasePolicy.md +20 -0
  10. data/benchmarks/db_loading.rb +34 -0
  11. data/benchmarks/statistics.rb +6 -6
  12. data/benchmarks/where_clause.rb +1 -1
  13. data/benchmarks/where_vs_filter.rb +1 -1
  14. data/daru.gemspec +17 -41
  15. data/lib/daru.rb +10 -13
  16. data/lib/daru/accessors/gsl_wrapper.rb +1 -1
  17. data/lib/daru/accessors/nmatrix_wrapper.rb +2 -0
  18. data/lib/daru/category.rb +29 -15
  19. data/lib/daru/configuration.rb +34 -0
  20. data/lib/daru/core/group_by.rb +158 -77
  21. data/lib/daru/core/merge.rb +12 -3
  22. data/lib/daru/core/query.rb +20 -4
  23. data/lib/daru/dataframe.rb +692 -118
  24. data/lib/daru/date_time/index.rb +14 -11
  25. data/lib/daru/date_time/offsets.rb +9 -1
  26. data/lib/daru/extensions/which_dsl.rb +55 -0
  27. data/lib/daru/formatters/table.rb +3 -5
  28. data/lib/daru/index/categorical_index.rb +4 -4
  29. data/lib/daru/index/index.rb +131 -42
  30. data/lib/daru/index/multi_index.rb +118 -10
  31. data/lib/daru/io/csv/converters.rb +21 -0
  32. data/lib/daru/io/io.rb +105 -33
  33. data/lib/daru/io/sql_data_source.rb +10 -0
  34. data/lib/daru/iruby/templates/dataframe.html.erb +4 -51
  35. data/lib/daru/iruby/templates/dataframe_mi.html.erb +3 -56
  36. data/lib/daru/iruby/templates/dataframe_mi_tbody.html.erb +35 -0
  37. data/lib/daru/iruby/templates/dataframe_mi_thead.html.erb +21 -0
  38. data/lib/daru/iruby/templates/dataframe_tbody.html.erb +28 -0
  39. data/lib/daru/iruby/templates/dataframe_thead.html.erb +21 -0
  40. data/lib/daru/iruby/templates/vector.html.erb +3 -25
  41. data/lib/daru/iruby/templates/vector_mi.html.erb +3 -34
  42. data/lib/daru/iruby/templates/vector_mi_tbody.html.erb +26 -0
  43. data/lib/daru/iruby/templates/vector_mi_thead.html.erb +8 -0
  44. data/lib/daru/iruby/templates/vector_tbody.html.erb +17 -0
  45. data/lib/daru/iruby/templates/vector_thead.html.erb +8 -0
  46. data/lib/daru/maths/arithmetic/vector.rb +38 -2
  47. data/lib/daru/maths/statistics/dataframe.rb +28 -30
  48. data/lib/daru/maths/statistics/vector.rb +295 -41
  49. data/lib/daru/plotting/gruff/dataframe.rb +13 -15
  50. data/lib/daru/plotting/nyaplot/category.rb +1 -1
  51. data/lib/daru/plotting/nyaplot/dataframe.rb +15 -4
  52. data/lib/daru/plotting/nyaplot/vector.rb +1 -2
  53. data/lib/daru/vector.rb +308 -96
  54. data/lib/daru/version.rb +1 -1
  55. data/profile/vector_new.rb +9 -0
  56. data/spec/accessors/gsl_wrapper_spec.rb +38 -35
  57. data/spec/accessors/nmatrix_wrapper_spec.rb +25 -22
  58. data/spec/category_spec.rb +24 -20
  59. data/spec/core/group_by_spec.rb +238 -4
  60. data/spec/core/merge_spec.rb +1 -1
  61. data/spec/core/query_spec.rb +65 -50
  62. data/spec/daru_spec.rb +22 -0
  63. data/spec/dataframe_spec.rb +473 -16
  64. data/spec/date_time/date_time_index_helper_spec.rb +72 -0
  65. data/spec/date_time/index_spec.rb +34 -16
  66. data/spec/date_time/offsets_spec.rb +14 -0
  67. data/spec/extensions/rserve_spec.rb +1 -1
  68. data/spec/extensions/which_dsl_spec.rb +38 -0
  69. data/spec/fixtures/boolean_converter_test.csv +5 -0
  70. data/spec/fixtures/duplicates.csv +32 -0
  71. data/spec/fixtures/eciresults.html +394 -0
  72. data/spec/fixtures/empty_rows_test.csv +17 -0
  73. data/spec/fixtures/macau.html +3691 -0
  74. data/spec/fixtures/macd_data.csv +150 -0
  75. data/spec/fixtures/matrix_test.csv +55 -55
  76. data/spec/fixtures/moneycontrol.html +6812 -0
  77. data/spec/fixtures/string_converter_test.csv +5 -0
  78. data/spec/fixtures/test_xls.xls +0 -0
  79. data/spec/fixtures/test_xls_2.xls +0 -0
  80. data/spec/fixtures/url_test.txt~ +0 -0
  81. data/spec/fixtures/valid_markup.html +62 -0
  82. data/spec/fixtures/wiki_climate.html +1243 -0
  83. data/spec/fixtures/wiki_table_info.html +631 -0
  84. data/spec/formatters/table_formatter_spec.rb +29 -0
  85. data/spec/index/categorical_index_spec.rb +33 -33
  86. data/spec/index/index_spec.rb +160 -41
  87. data/spec/index/multi_index_spec.rb +143 -33
  88. data/spec/io/io_spec.rb +246 -2
  89. data/spec/io/sql_data_source_spec.rb +31 -41
  90. data/spec/iruby/dataframe_spec.rb +17 -19
  91. data/spec/iruby/vector_spec.rb +26 -28
  92. data/spec/maths/arithmetic/dataframe_spec.rb +1 -1
  93. data/spec/maths/arithmetic/vector_spec.rb +18 -0
  94. data/spec/maths/statistics/vector_spec.rb +153 -15
  95. data/spec/plotting/gruff/category_spec.rb +3 -3
  96. data/spec/plotting/gruff/dataframe_spec.rb +14 -4
  97. data/spec/plotting/gruff/vector_spec.rb +9 -9
  98. data/spec/plotting/nyaplot/category_spec.rb +5 -9
  99. data/spec/plotting/nyaplot/dataframe_spec.rb +95 -47
  100. data/spec/plotting/nyaplot/vector_spec.rb +5 -11
  101. data/spec/shared/vector_display_spec.rb +12 -14
  102. data/spec/spec_helper.rb +30 -7
  103. data/spec/support/matchers.rb +5 -0
  104. data/spec/vector_spec.rb +306 -72
  105. metadata +96 -55
  106. data/spec/fixtures/stock_data.csv +0 -500
@@ -1,5 +1,5 @@
1
1
  module Daru
2
- class MultiIndex < Index
2
+ class MultiIndex < Index # rubocop:disable Metrics/ClassLength
3
3
  def each(&block)
4
4
  to_a.each(&block)
5
5
  end
@@ -9,11 +9,74 @@ module Daru
9
9
  end
10
10
 
11
11
  attr_reader :labels
12
+ attr_reader :name
12
13
 
13
14
  def levels
14
15
  @levels.map(&:keys)
15
16
  end
16
17
 
18
+ # names and levels should be of the same size. If the size of Array `name` is
19
+ # less or greater than the size of Array `levels`, then it raises `SizeError`.
20
+ # If the user doesn't want to name a particular level, then the user must put
21
+ # an empty string at that index of Array `name`.
22
+ # For example, if a multi_index has 3 levels and the user doesn't want to name
23
+ # level 0, then do multi_index.name = ['', 'level1_name', 'level2_name']
24
+ #
25
+ # @example
26
+ #
27
+ # # set the name during initialization
28
+ #
29
+ # mi = Daru::MultiIndex.new(
30
+ # levels: [[:a,:b,:c], [:one, :two]],
31
+ # labels: [[0,0,1,1,2,2], [0,1,0,1,0,1]], name: ['s1', 's2'])
32
+ #
33
+ # # =>
34
+ # # <Daru::MultiIndex(6x2)>
35
+ # # s1 s2
36
+ # # a one
37
+ # # two
38
+ # # b one
39
+ # # two
40
+ # # c one
41
+ # # two
42
+ #
43
+ # # set new name
44
+ #
45
+ # mi.name = ['k1', 'k2']
46
+ # => ["k1", "k2"]
47
+ #
48
+ # mi
49
+ # =>
50
+ # # #<Daru::MultiIndex(6x2)>
51
+ # # k1 k2
52
+ # # a one
53
+ # # two
54
+ # # b one
55
+ # # two
56
+ # # c one
57
+ # # two
58
+ #
59
+ # # access the name
60
+ #
61
+ # mi.name
62
+ # => ["k1", "k2"]
63
+ #
64
+ # # If you don't want to name level 0
65
+ #
66
+ # mi.name = ['', 'k2']
67
+ # => ["", "k2"]
68
+ #
69
+ # mi
70
+ # =>
71
+ # #<Daru::MultiIndex(6x2)>
72
+ # # k2
73
+ # # a one
74
+ # # two
75
+ # # b one
76
+ # # two
77
+ # # c one
78
+ # # two
79
+ #
17
80
  def initialize opts={}
18
81
  labels = opts[:labels]
19
82
  levels = opts[:levels]
@@ -24,6 +87,12 @@ module Daru
24
87
 
25
88
  @labels = labels
26
89
  @levels = levels.map { |e| e.map.with_index.to_h }
90
+ self.name = opts[:name] unless opts[:name].nil?
91
+ end
92
+
93
+ def name=(names)
94
+ validate_name names, @labels
95
+ @name = names
27
96
  end
28
97
 
29
98
  def incorrect_fields?(_labels, levels)
@@ -84,7 +153,7 @@ module Daru
84
153
  # Returns positions given indexes or positions
85
154
  # @note If the argument is both a valid index and a valid position,
86
155
  # it will be treated as a valid index
87
- # @param [Array<object>] *indexes indexes or positions
156
+ # @param indexes [Array<object>] indexes or positions
88
157
  # @example
89
158
  # idx = Daru::MultiIndex.from_tuples [[:a, :one], [:a, :two], [:b, :one], [:b, :two]]
90
159
  # idx.pos :a
@@ -109,7 +178,7 @@ module Daru
109
178
 
110
179
  # Takes positional values and returns subset of the self
111
180
  # capturing the indexes at mentioned positions
112
- # @param [Array<Integer>] positional values
181
+ # @param positions [Array<Integer>] positional values
113
182
  # @return [object] index object
114
183
  # @example
115
184
  # idx = Daru::MultiIndex.from_tuples [[:a, :one], [:a, :two], [:b, :one], [:b, :two]]
@@ -128,12 +197,12 @@ module Daru
128
197
  end
129
198
 
130
199
  def add *indexes
131
- Daru::MultiIndex.from_tuples to_a << indexes
200
+ Daru::MultiIndex.from_tuples(to_a + [indexes])
132
201
  end
133
202
 
134
203
  def reorder(new_order)
135
204
  from = to_a
136
- self.class.from_tuples(new_order.map { |i| from[i] })
205
+ MultiIndex.from_tuples(new_order.map { |i| from[i] })
137
206
  end
138
207
 
139
208
  def try_retrieve_from_integer int
@@ -171,8 +240,42 @@ module Daru
171
240
  end
172
241
  end
173
242
 
243
+ def remove_layer layer_index
244
+ @levels.delete_at(layer_index)
245
+ @labels.delete_at(layer_index)
246
+ @name.delete_at(layer_index) unless @name.nil?
247
+
248
+ coerce_index
249
+ end
250
+
251
+ def coerce_index
252
+ if @levels.size == 1
253
+ elements = to_a.flatten
254
+
255
+ if elements.uniq.length == elements.length
256
+ Daru::Index.new(elements)
257
+ else
258
+ Daru::CategoricalIndex.new(elements)
259
+ end
260
+ else
261
+ self
262
+ end
263
+ end
264
+
265
+ # Array `name` must have the same length as levels and labels.
266
+ def validate_name names, levels
267
+ error_msg = "'names' and 'levels' should be of same size. Size of the "\
268
+ "'name' array is #{names.size} and size of the MultiIndex 'levels' and "\
269
+ "'labels' is #{labels.size}."
270
+ suggestion_msg = "If you don\'t want to set name for particular level " \
271
+ "(say level 'i') then put empty string on index 'i' of the 'name' Array."
272
+
273
+ raise SizeError, error_msg if names.size > levels.size
274
+ raise SizeError, [error_msg, suggestion_msg].join("\n") if names.size < levels.size
275
+ end
276
+
174
277
  private :find_all_indexes, :multi_index_from_multiple_selections,
175
- :retrieve_from_range, :retrieve_from_tuples
278
+ :retrieve_from_range, :retrieve_from_tuples, :validate_name
176
279
 
177
280
  def key index
178
281
  raise ArgumentError, "Key #{index} is too large" if index >= @labels[0].size
@@ -183,7 +286,7 @@ module Daru
183
286
  end
184
287
 
185
288
  def dup
186
- MultiIndex.new levels: levels.dup, labels: labels
289
+ MultiIndex.new levels: levels.dup, labels: labels.dup, name: (@name.nil? ? nil : @name.dup)
187
290
  end
188
291
 
189
292
  def drop_left_level by=1
@@ -204,8 +307,9 @@ module Daru
204
307
 
205
308
  def include? tuple
206
309
  return false unless tuple.is_a? Enumerable
207
- tuple.flatten.each_with_index
208
- .all? { |tup, i| @levels[i][tup] }
310
+ @labels[0...tuple.flatten.size]
311
+ .transpose
312
+ .include?(tuple.flatten.each_with_index.map { |e, i| @levels[i][e] })
209
313
  end
210
314
 
211
315
  def size
@@ -232,7 +336,7 @@ module Daru
232
336
 
233
337
  def inspect threshold=20
234
338
  "#<Daru::MultiIndex(#{size}x#{width})>\n" +
235
- Formatters::Table.format([], row_headers: sparse_tuples, threshold: threshold)
339
+ Formatters::Table.format([], headers: @name, row_headers: sparse_tuples, threshold: threshold)
236
340
  end
237
341
 
238
342
  def to_html
@@ -262,5 +366,9 @@ module Daru
262
366
  [nil] * (cur.size - left.size) + left.map(&:first)
263
367
  }
264
368
  end
369
+
370
+ def to_df
371
+ Daru::DataFrame.new(@name.zip(to_a.transpose).to_h)
372
+ end
265
373
  end
266
374
  end
@@ -0,0 +1,21 @@
1
+ module Daru
2
+ module IO
3
+ module CSV
4
+ CONVERTERS = {
5
+ boolean: lambda { |f, _|
6
+ case f.downcase.strip
7
+ when 'true'
8
+ true
9
+ when 'false'
10
+ false
11
+ else
12
+ f
13
+ end
14
+ },
15
+ string: lambda { |f, _|
16
+ f
17
+ }
18
+ }.freeze
19
+ end
20
+ end
21
+ end
@@ -1,4 +1,5 @@
1
1
  module Daru
2
+ require_relative 'csv/converters.rb'
2
3
  module IOHelpers
3
4
  class << self
4
5
  def process_row(row,empty)
@@ -39,14 +40,11 @@ module Daru
39
40
 
40
41
  def from_excel path, opts={}
41
42
  opts = {
42
- worksheet_id: 0
43
+ worksheet_id: 0,
44
+ row_id: 0
43
45
  }.merge opts
44
46
 
45
- worksheet_id = opts[:worksheet_id]
46
- book = Spreadsheet.open path
47
- worksheet = book.worksheet worksheet_id
48
- headers = ArrayHelper.recode_repeated(worksheet.row(0)).map(&:to_sym)
49
-
47
+ worksheet, headers = read_from_excel(path, opts)
50
48
  df = Daru::DataFrame.new({})
51
49
  headers.each_with_index do |h,i|
52
50
  col = worksheet.column(i).to_a
@@ -57,6 +55,18 @@ module Daru
57
55
  df
58
56
  end
59
57
 
58
+ def read_from_excel path, opts
59
+ optional_gem 'spreadsheet', '~>1.1.1'
60
+
61
+ worksheet_id = opts[:worksheet_id]
62
+ row_id = opts[:row_id]
63
+ book = Spreadsheet.open path
64
+ worksheet = book.worksheet worksheet_id
65
+ headers = ArrayHelper.recode_repeated(worksheet.row(row_id)).map(&:to_sym)
66
+
67
+ [worksheet, headers]
68
+ end
69
+
60
70
  def dataframe_write_excel dataframe, path, _opts={}
61
71
  book = Spreadsheet::Workbook.new
62
72
  sheet = book.create_worksheet
@@ -76,7 +86,6 @@ module Daru
76
86
  # Functions for loading/writing CSV files
77
87
  def from_csv path, opts={}
78
88
  daru_options, opts = from_csv_prepare_opts opts
79
-
80
89
  # Preprocess headers for detecting and correcting repetition in
81
90
  # case the :headers option is not specified.
82
91
  hsh =
@@ -86,7 +95,6 @@ module Daru
86
95
  from_csv_hash(path, opts)
87
96
  .tap { |hash| daru_options[:order] = hash.keys }
88
97
  end
89
-
90
98
  Daru::DataFrame.new(hsh,daru_options)
91
99
  end
92
100
 
@@ -111,11 +119,10 @@ module Daru
111
119
 
112
120
  # Execute a query and create a data frame from the result
113
121
  #
114
- # @param dbh [DBI::DatabaseHandle] A DBI connection to be used to run the query
122
+ # @param db [DBI::DatabaseHandle, String] A DBI connection or the path to a SQLite3 database.
115
123
  # @param query [String] The query to be executed
116
124
  #
117
125
  # @return A dataframe containing the data resulting from the query
118
-
119
126
  def from_sql(db, query)
120
127
  require 'daru/io/sql_data_source'
121
128
  SqlDataSource.make_dataframe(db, query)
@@ -135,23 +142,11 @@ module Daru
135
142
  #
136
143
  # @return A dataframe containing the data in the given relation
137
144
  def from_activerecord(relation, *fields)
138
- if fields.empty?
139
- records = relation.map do |record|
140
- record.attributes.symbolize_keys
141
- end
142
- return Daru::DataFrame.new(records)
143
- else
144
- fields = fields.map(&:to_sym)
145
- end
146
-
147
- vectors = fields.map { |name| [name, Daru::Vector.new([], name: name)] }.to_h
145
+ fields = relation.klass.column_names if fields.empty?
146
+ fields = fields.map(&:to_sym)
148
147
 
149
- Daru::DataFrame.new(vectors, order: fields).tap do |df|
150
- relation.pluck(*fields).each do |record|
151
- df.add_row(Array(record))
152
- end
153
- df.update
154
- end
148
+ result = relation.pluck(*fields).transpose
149
+ Daru::DataFrame.new(result, order: fields).tap(&:update)
155
150
  end
156
151
 
157
152
  # Loading data from plain text files
@@ -186,13 +181,34 @@ module Daru
186
181
  end
187
182
  end
188
183
 
184
+ def from_html path, opts
185
+ optional_gem 'mechanize', '~>2.7.5'
186
+ page = Mechanize.new.get(path)
187
+ page.search('table').map { |table| html_parse_table table }
188
+ .keep_if { |table| html_search table, opts[:match] }
189
+ .compact
190
+ .map { |table| html_decide_values table, opts }
191
+ .map { |table| html_table_to_dataframe table }
192
+ end
193
+
189
194
  private
190
195
 
191
- DARU_OPT_KEYS = [:clone, :order, :index, :name].freeze
196
+ def optional_gem(name, version)
197
+ gem name, version
198
+ require name
199
+ rescue LoadError
200
+ Daru.error "\nInstall the #{name} gem version #{version} for using"\
201
+ " #{name} functions."
202
+ end
203
+
204
+ DARU_OPT_KEYS = %i[clone order index name].freeze
192
205
 
193
206
  def from_csv_prepare_opts opts
194
207
  opts[:col_sep] ||= ','
195
- opts[:converters] ||= :numeric
208
+ opts[:skip_blanks] ||= true
209
+ opts[:converters] ||= [:numeric]
210
+
211
+ opts[:converters] = from_csv_prepare_converters(opts[:converters])
196
212
 
197
213
  daru_options = opts.keys.each_with_object({}) do |k, hash|
198
214
  hash[k] = opts.delete(k) if DARU_OPT_KEYS.include?(k)
@@ -200,11 +216,22 @@ module Daru
200
216
  [daru_options, opts]
201
217
  end
202
218
 
219
+ def from_csv_prepare_converters(converters)
220
+ Array(converters).flat_map do |c|
221
+ if ::CSV::Converters[c]
222
+ ::CSV::Converters[c]
223
+ elsif Daru::IO::CSV::CONVERTERS[c]
224
+ Daru::IO::CSV::CONVERTERS[c]
225
+ else
226
+ c
227
+ end
228
+ end
229
+ end
230
+
203
231
  def from_csv_hash_with_headers(path, opts)
204
232
  opts[:header_converters] ||= :symbol
205
-
206
233
  ::CSV
207
- .read(path, 'rb',opts)
234
+ .parse(open(path), opts)
208
235
  .tap { |c| yield c if block_given? }
209
236
  .by_col.map { |col_name, values| [col_name, values] }.to_h
210
237
  end
@@ -212,15 +239,60 @@ module Daru
212
239
  def from_csv_hash(path, opts)
213
240
  csv_as_arrays =
214
241
  ::CSV
215
- .open(path, 'rb', opts)
242
+ .parse(open(path), **opts)
216
243
  .tap { |c| yield c if block_given? }
217
244
  .to_a
218
-
219
245
  headers = ArrayHelper.recode_repeated(csv_as_arrays.shift)
220
246
  csv_as_arrays = csv_as_arrays.transpose
221
-
222
247
  headers.each_with_index.map { |h, i| [h, csv_as_arrays[i]] }.to_h
223
248
  end
249
+
250
+ def html_parse_table(table)
251
+ headers, headers_size = html_scrape_tag(table,'th')
252
+ data, size = html_scrape_tag(table, 'td')
253
+ data = data.keep_if { |x| x.count == size }
254
+ order, indice = html_parse_hash(headers, size, headers_size) if headers_size >= size
255
+ return unless (indice.nil? || indice.count == data.count) && !order.nil? && order.count>0
256
+ {data: data.compact, index: indice, order: order}
257
+ end
258
+
259
+ def html_scrape_tag(table, tag)
260
+ arr = table.search('tr').map { |row| row.search(tag).map { |val| val.text.strip } }
261
+ size = arr.map(&:count).max
262
+ [arr, size]
263
+ end
264
+
265
+ # Splits headers (all th tags) into order and index. Wherein,
266
+ # Order : All <th> tags on first proper row of HTML table
267
+ # index : All <th> tags on first proper column of HTML table
268
+ def html_parse_hash(headers, size, headers_size)
269
+ headers_index = headers.find_index { |x| x.count == headers_size }
270
+ order = headers[headers_index]
271
+ order_index = order.count - size
272
+ order = order[order_index..-1]
273
+ indice = headers[headers_index+1..-1].flatten
274
+ indice = nil if indice.to_a.empty?
275
+ [order, indice]
276
+ end
277
+
278
+ def html_search(table, match=nil)
279
+ match.nil? ? true : (table.to_s.include? match)
280
+ end
281
+
282
+ # Allows user to override the scraped order / index / data
283
+ def html_decide_values(scraped_val={}, user_val={})
284
+ %I[data index name order].each do |key|
285
+ user_val[key] ||= scraped_val[key]
286
+ end
287
+ user_val
288
+ end
289
+
290
+ def html_table_to_dataframe(table)
291
+ Daru::DataFrame.rows table[:data],
292
+ index: table[:index],
293
+ order: table[:order],
294
+ name: table[:name]
295
+ end
224
296
  end
225
297
  end
226
298
  end