RubyGems - daru - Versions diffs - 0.1.2 → 0.1.3 - Mend

daru 0.1.2 → 0.1.3

Files changed (49) hide show

checksums.yaml +4 -4
data/.gitignore +2 -0
data/.rubocop.yml +99 -0
data/.rubocop_todo.yml +44 -0
data/.travis.yml +3 -1
data/CONTRIBUTING.md +5 -1
data/History.md +43 -0
data/README.md +3 -4
data/benchmarks/duplicating.rb +45 -0
data/benchmarks/group_by.rb +7 -7
data/benchmarks/joining.rb +52 -0
data/benchmarks/sorting.rb +9 -2
data/benchmarks/statistics.rb +39 -0
data/daru.gemspec +4 -4
data/lib/daru.rb +9 -9
data/lib/daru/accessors/array_wrapper.rb +15 -11
data/lib/daru/accessors/dataframe_by_row.rb +1 -1
data/lib/daru/accessors/gsl_wrapper.rb +30 -19
data/lib/daru/accessors/mdarray_wrapper.rb +1 -3
data/lib/daru/accessors/nmatrix_wrapper.rb +15 -15
data/lib/daru/core/group_by.rb +69 -16
data/lib/daru/core/merge.rb +135 -151
data/lib/daru/core/query.rb +9 -30
data/lib/daru/dataframe.rb +476 -439
data/lib/daru/date_time/index.rb +150 -137
data/lib/daru/date_time/offsets.rb +45 -41
data/lib/daru/extensions/rserve.rb +4 -4
data/lib/daru/index.rb +88 -64
data/lib/daru/io/io.rb +33 -34
data/lib/daru/io/sql_data_source.rb +11 -11
data/lib/daru/maths/arithmetic/dataframe.rb +19 -19
data/lib/daru/maths/arithmetic/vector.rb +9 -14
data/lib/daru/maths/statistics/dataframe.rb +89 -61
data/lib/daru/maths/statistics/vector.rb +226 -97
data/lib/daru/monkeys.rb +23 -30
data/lib/daru/plotting/dataframe.rb +27 -28
data/lib/daru/plotting/vector.rb +12 -13
data/lib/daru/vector.rb +221 -330
data/lib/daru/version.rb +2 -2
data/spec/core/group_by_spec.rb +16 -0
data/spec/core/merge_spec.rb +30 -14
data/spec/dataframe_spec.rb +268 -14
data/spec/index_spec.rb +23 -5
data/spec/io/io_spec.rb +37 -16
data/spec/math/statistics/dataframe_spec.rb +40 -8
data/spec/math/statistics/vector_spec.rb +135 -10
data/spec/monkeys_spec.rb +3 -3
data/spec/vector_spec.rb +157 -25
metadata +41 -21

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz: ed2a3e2a4cd9fce8d95af6aac9c3db532eed444f
-  data.tar.gz: 90ca6a62ee824d20f72a9f6689c03f27d7667168
+  metadata.gz: 6a72d4b2565e47c5c4112aac514a7191bd4f962c
+  data.tar.gz: 2f68b0bb56e621f36d32f6bb9ecc541a61af7323
 SHA512:
-  metadata.gz: e6f3345ef4372e1c45a3d80c0cc61c2b4c72e4c810cfb183f30bfd9285a09639ea39cd0a3597fc63551d7f72398d8d83af4424855018e8a5b2a99274b46625cd
-  data.tar.gz: 65d262b1deec54680a5fdcfecda3530c9fb9450dbd280c18833b655521418ed340f311253221dfd4e018577b063f9d3638d0d600a108c3b98ec5a7cd2dfe98ec
+  metadata.gz: 950a36a9956dd37ac334bd1b657cfb8a74994b1c771846b359d8c885d8ec3c62edb75df18a951086799c77a2fe7c3425c64c0f069fd85b50470095d13ba323c0
+  data.tar.gz: 0b9d8815d90f947a7a2dcf5447dd46057dbe559ecd3959e88e4932afb945183a168c46fb131bdaca9c07682a28e408d3d19b2e716b57ddad0638b375371f6c3b

data/.gitignore CHANGED

@@ -2,3 +2,5 @@
 Gemfile.lock
 doc/
 .yardoc/
+.bundle
+vendor/

data/.rubocop.yml ADDED

@@ -0,0 +1,99 @@
+inherit_from: .rubocop_todo.yml
+AllCops:
+  Include:
+    - 'lib/**/*'
+  Exclude:
+    - 'spec/*'
+    - 'spec/**/*'
+    - 'vendor/**/*'
+    - 'benchmarks/*'
+  DisplayCopNames: true
+# Preferred codebase style ---------------------------------------------
+Style/ExtraSpacing:
+  AllowForAlignment: true
+Style/FormatString:
+  EnforcedStyle: percent
+Style/AndOr:
+  EnforcedStyle: conditionals
+Style/SpaceAroundEqualsInParameterDefault:
+  EnforcedStyle: no_space
+Style/SpaceInsideBlockBraces:
+  EnforcedStyle: space
+Style/SpaceInsideHashLiteralBraces:
+  EnforcedStyle: no_space
+Style/AlignParameters:
+  EnforcedStyle: with_fixed_indentation
+Style/EmptyElse:
+  EnforcedStyle: empty
+Style/ParallelAssignment:
+  Enabled: false
+Style/DoubleNegation:
+  Enabled: false
+Style/SingleLineBlockParams:
+  Enabled: false
+Style/PerlBackrefs:
+  Enabled: false
+Style/SpaceAfterComma:
+  Enabled: false
+Style/SpaceAroundOperators:
+  Enabled: false
+Style/EmptyCaseCondition:
+  Enabled: false
+# Neither of the preferred styles is good enough :(
+Style/BlockDelimiters:
+  Enabled: false
+# TODO -----------------------------------------------------------------
+Style/Documentation:
+  Enabled: false
+# To discuss and decide ------------------------------------------------
+# FIXME: in fact, rescue modifier is rarely a good choice.
+#  But currently I can't fully grasp the three places they are used.
+#  So, leaving them intact. - zverok, 2016-05-07
+Style/RescueModifier:
+  Exclude:
+    - 'lib/daru/accessors/gsl_wrapper.rb'
+    - 'lib/daru/dataframe.rb'
+    - 'lib/daru/io/sql_data_source.rb'
+# FIXME: we should enable this cop and fix the offenses at some point - zverok, 2016-05-07
+Style/Alias:
+  Enabled: false
+# FIXME: should decide about this.
+# Personally I prefer (as most of Ruby community) to use parens, but
+# we also can enforce style to NOT using them. Yet it definitely should
+# be only one style. Current codebase uses ~400 method defs without and
+# ~ 100 method defs with them. - zverok, 2016-05-07
+Style/MethodDefParentheses:
+  Enabled: false
+# Should be fixed, but require change of public API --------------------
+# Bans methods like `has_missing_data?`, `is_number?` and so on - those that
+# start with an unnecessary has_ or is_.
+Style/PredicateName:
+  Exclude:
+    - 'lib/daru/dataframe.rb'
+    - 'lib/daru/monkeys.rb'
+    - 'lib/daru/vector.rb'

data/.rubocop_todo.yml ADDED

@@ -0,0 +1,44 @@
+# This configuration was generated by
+# `rubocop --auto-gen-config`
+# on 2016-05-06 16:48:54 +0300 using RuboCop version 0.39.0.
+# The point is for the user to remove these configuration records
+# one by one as the offenses are removed from the code base.
+# Note that changes in the inspected code, or installation of new
+# versions of RuboCop, may require this file to be generated again.
+# Offense count: 66
+Metrics/AbcSize:
+  Max: 110
+# Offense count: 6
+Metrics/BlockNesting:
+  Max: 6
+# Offense count: 6
+# Configuration parameters: CountComments.
+Metrics/ClassLength:
+  Max: 1400
+# Offense count: 26
+Metrics/CyclomaticComplexity:
+  Max: 22
+# Offense count: 273
+# Configuration parameters: AllowHeredoc, AllowURI, URISchemes.
+# URISchemes: http, https
+Metrics/LineLength:
+  Max: 164
+# Offense count: 81
+# Configuration parameters: CountComments.
+Metrics/MethodLength:
+  Max: 100
+# Offense count: 3
+# Configuration parameters: CountComments.
+Metrics/ModuleLength:
+  Max: 419
+# Offense count: 22
+Metrics/PerceivedComplexity:
+  Max: 28

data/.travis.yml CHANGED

@@ -11,7 +11,9 @@ matrix:
   fast_finish:
     true
-script: "bundle exec rspec"
+script:
+  - bundle exec rspec
+  - bundle exec rubocop
 install:
   - gem install bundler

data/CONTRIBUTING.md CHANGED

@@ -21,6 +21,10 @@ And run the test suite (should be all green with pending tests):
 If you have problems installing nmatrix, please consult the [nmatrix installation wiki](https://github.com/SciRuby/nmatrix/wiki/Installation) or the [mailing list](https://groups.google.com/forum/#!forum/sciruby-dev).
+While preparing your pull requests, don't forget to check your code with Rubocop:
+  `bundle exec rubocop`
 ## Daru internals
-To get an overview of certain internals of daru and their implementation, go over [this blog post](http://v0dro.github.io/blog/2015/08/16/elaboration-on-certain-internals-of-daru/).
+To get an overview of certain internals of daru and their implementation, go over [this blog post](http://v0dro.github.io/blog/2015/08/16/elaboration-on-certain-internals-of-daru/).

data/History.md CHANGED

@@ -1,3 +1,46 @@
+# 0.1.3 (May 2016)
+* Enhancements
+    - Proper error handling for case where an index specified by the user is not actually present in the DataFrame/Vector (@lokeshh).
+    - DataFrame CSV writer function will now suppress headers when passing headers: false (@gnilrets).
+    - Refactor Index and MultiIndex so that a Vector or DataFrame can access the actual index number without having to check the exact type of index every time (@lokeshh).
+    - Refactor `Vector#[]=` to not use conditionals (@lokeshh).
+    - Custom `#dup` method for `Daru::DateTimeIndex` (@Deepakkoli93).
+    - Massive performance boost to Vector and DataFrame sorting by using in-built Array#sort and removing previous hand-made sort (@lokeshh).
+    - Handle nils in sorting for Vectors and DataFrame (@lokeshh, @gnilrets).
+    - Add #describe function for Vectors (@shahsaurabh0605).
+    - Adds support for concatenating dataframes that don't share all the same vectors (@gnilrets).
+    - Massive performance enhancement for joins using the sorted merge method (@gnilrets).
+    - New statistics methods and tests for DataFrame (@shahsaurabh0605).
+    - Add explicit conversion to hash for DataFrame (DataFrame#to_h, Vector#to_h) and remove implicit conversion to hash (DataFrame#to_hash, Vector#to_hash) (@gnilrets).
+    - Add `DataFrame#rename_vectors` for simplifying renaming of vectors in DataFrame (@gnilrets).
+    - MultiIndex raises error on accessing an invalid index (@shreyanshd).
+    - Order columns as given in the CSV file when reading into a DataFrame from CSV using `DataFrame.from_csv` (@lokeshh).
+    - Add `Vector#percent_change` and `DataFrame#percent_change` (@shahsaurabh0605).
+    - Faster `DataFrame#filter_rows` (@lokeshh).
+    - Added `Vector#emv` for calculating exponential moving variance of Vector (@shahsaurabh0605).
+    - Add support for associating metadata with a Vector or DataFrame using the :metadata option (@gnilrets).
+    - Add `Vector#emsd` for calculating exponential moving standard deviation of Vector (@shahsaurabh0605).
+    - Sample and population covariance functions for Vector (@shahsaurabh0605).
+    - Improve `DataFrame#dup` performance (@gnilrets).
+    - Add `Daru::DataFrame::Core::GroupBy#reduce` for reducing groups by passing a block (@gnilrets).
+    - Add rubocop as development dependency and make changes suggested by it to conform to the Ruby Style Guide (@zverok).
+    - Allow Daru::Index to be initialized by a Range (@lokeshh).
+* Fixes
+    - Fix conflict with narray that caused namespace clashes with nmatrix in case both narray and nmatrix were installed on the user's system (@lokeshh).
+    - Fix bug with dataframe concatenation that caused modifying the arrays that
+    compose the vectors in the original dataframes (@gnilrets).
+    - Fix an error where the Vectors in an empty DataFrame would not be assigned correct names (@lokeshh).
+    - Correct spelling mistakes and fix broken links in README (@lokeshh).
+    - Fix bug in Vector#mode (@sunshineyyy).
+    - Fix `Vector#index_of` method to handle dtype :array differently (@lokeshh).
+    - Fix `DateTimeIndex#include?` method since it was raising an exception when index not found. It returns false now (@Phitherek).
+    - Handle nils in group_by keys (@gnilrets).
+    - Handle nils for statistics methods in Vector and DataFrame for :array and :gsl data (@lokeshh).
+    - Fix `DataFrame#clone` when no arguments have been passed to it (@lokeshh).
+    - Fix bug when joining empty dataframes (@gnilrets).
 # 0.1.2
 * Enhancements

data/README.md CHANGED

@@ -7,7 +7,7 @@
 daru (Data Analysis in RUby) is a library for storage, analysis, manipulation and visualization of data in Ruby.
-daru makes it easy and intuituive to process data predominantly through 2 data structures: `Daru::DataFrame` and `Daru::Vector`. Written in pure Ruby works with all ruby implementations. Tested with MRI 2.0, 2.1, 2.2 and 2.3.
+daru makes it easy and intuitive to process data predominantly through 2 data structures: `Daru::DataFrame` and `Daru::Vector`. Written in pure Ruby works with all ruby implementations. Tested with MRI 2.0, 2.1, 2.2 and 2.3.
 ## Features
@@ -16,7 +16,7 @@ daru makes it easy and intuituive to process data predominantly through 2 data s
     - DataFrame - A 2-D spreadsheet-like structure for manipulating and storing data sets. This is daru's primary data structure.
 * Compatible with [IRuby notebook](https://github.com/SciRuby/iruby), [statsample](https://github.com/SciRuby/statsample), [statsample-glm](https://github.com/SciRuby/statsample-glm) and [statsample-timeseries](https://github.com/SciRuby/statsample-timeseries).
 * Support for time series.
-* Singly and hierarchially indexed data structures.
+* Singly and hierarchically indexed data structures.
 * Flexible and intuitive API for manipulation and analysis of data.
 * Easy plotting, statistics and arithmetic.
 * Plentiful iterators.
@@ -150,7 +150,7 @@ data_frame.where(
 *Plotting*
-Daru supports plotting of interactive graphs with [nyaplot](). You can easily create a plot with the `#plot` method. Here we plot the gallons sold on the Y axis and name of the brand on the X axis in a bar graph.
+Daru supports plotting of interactive graphs with [nyaplot](https://github.com/domitry/nyaplot). You can easily create a plot with the `#plot` method. Here we plot the gallons sold on the Y axis and name of the brand on the X axis in a bar graph.
 ``` ruby
 data_frame.plot type: :bar, x: 'Beer', y: 'Gallons sold' do |plot, diagram|
   plot.x_label "Beer"
@@ -179,7 +179,6 @@ Docs can be found [here](https://rubygems.org/gems/daru).
 * Statistics on DataFrame over rows.
 * Calculate percentage change.
 * Have some sample data sets for users to play around with. Should be able to load these from the code itself.
-* Sorting with missing data present.
 ## Contributing

data/benchmarks/duplicating.rb ADDED

@@ -0,0 +1,45 @@
+$:.unshift File.expand_path("../../lib", __FILE__)
+require 'benchmark'
+require 'daru'
+# Check scaling
+base_n = 10000
+0.upto(2) do |iscale|
+  n = base_n * 2**iscale
+  df_h = ('a'..'z').map { |v| v.to_sym }.reduce({}) do |h, v|
+    h[v] = Daru::Vector.new(1.upto(n).to_a)
+    h
+  end
+  df = Daru::DataFrame.new(df_h)
+  Benchmark.bm do |bm|
+    bm.report("dupe (n=#{n})") do
+      df.dup
+    end
+  end
+end
+#                   ===== Benchmarks =====
+# System: iMac Late 2013 3.5GHz Core i7
+#
+#       user     system      total        real
+#dupe (n=10000)  0.590000   0.020000   0.610000 (  0.613648)
+#       user     system      total        real
+#dupe (n=20000)  1.170000   0.040000   1.210000 (  1.236629)
+#       user     system      total        real
+#dupe (n=40000)  2.390000   0.070000   2.460000 (  2.511199)
+#                   ===== Prior Benchmarks (Daru 0.1.2 - 2707559369c03894a8394714820aabf116b99b20 - 2016-04-25) =====
+# Note that the n here is 100x smaller than above
+#       user     system      total        real
+#dupe (n=100)  0.220000   0.000000   0.220000 (  0.227924)
+#       user     system      total        real
+#dupe (n=200)  0.850000   0.000000   0.850000 (  0.856591)
+#       user     system      total        real
+#dupe (n=400)  3.370000   0.020000   3.390000 (  3.428211)

data/benchmarks/group_by.rb CHANGED

@@ -7,11 +7,11 @@ data = Daru::DataFrame.from_csv 'TradeoffData.csv'
 Benchmark.bm do |x|
   x.report("Single column grouping") do
-    @single = data.group_by([:Treatment])
+    @single = data.group_by(['Treatment'])
   end
   x.report("Multi-column grouping") do
-    @multi = data.group_by([:Group, :Treatment])
+    @multi = data.group_by(['Group', 'Treatment'])
   end
   x.report("Single mean") do
@@ -24,9 +24,9 @@ Benchmark.bm do |x|
 end
 #                    ===== Benchmarks =====
-#
+#
 #                          user     system      total        real
-# Single column grouping 0.000000   0.000000   0.000000   (0.000356)
-# Multi-column grouping  0.000000   0.000000   0.000000   (0.000958)
-# Single mean            0.000000   0.000000   0.000000   (0.000865)
-# Multi mean             0.000000   0.000000   0.000000   (0.002748)
+# Single column grouping  0.000000   0.000000   0.000000  (0.000340)
+# Multi-column grouping   0.000000   0.000000   0.000000  (0.000855)
+# Single mean             0.000000   0.000000   0.000000  (0.001208)
+# Multi mean              0.000000   0.000000   0.000000  (0.004892)

data/benchmarks/joining.rb ADDED

@@ -0,0 +1,52 @@
+$:.unshift File.expand_path("../../lib", __FILE__)
+require 'benchmark'
+require 'daru'
+# Check scaling
+base_n = 10000
+0.upto(2) do |iscale|
+  n = base_n * 2**iscale
+  keys = (1..(n)).to_a
+  base_data = { idx: 1.upto(n).to_a, keys: 1.upto(n).map { |v| keys[Random.rand(n)]}}
+  lookup_hash = keys.map { |k| [k, k * 100]}.to_h
+  base_data_df = Daru::DataFrame.new(base_data)
+  lookup_df = Daru::DataFrame.new({ keys: lookup_hash.keys, values: lookup_hash.values })
+  Benchmark.bm do |bm|
+    bm.report("Inner join (n=#{n})") do
+      base_data_df.join(lookup_df, on: [:keys], how: :inner)
+    end
+    bm.report("Outer join (n=#{n})") do
+      base_data_df.join(lookup_df, on: [:keys], how: :outer)
+    end
+  end
+end
+#                   ===== Benchmarks =====
+# System: MacBook Pro Mid 2014 3GHz Core i7
+#
+#       user     system      total        real
+#Inner join (n=10000)  0.170000   0.000000   0.170000 (  0.182254)
+#Outer join (n=10000)  0.200000   0.000000   0.200000 (  0.203022)
+#       user     system      total        real
+#Inner join (n=20000)  0.380000   0.000000   0.380000 (  0.387600)
+#Outer join (n=20000)  0.410000   0.000000   0.410000 (  0.415644)
+#       user     system      total        real
+#Inner join (n=40000)  0.720000   0.010000   0.730000 (  0.743787)
+#Outer join (n=40000)  0.810000   0.010000   0.820000 (  0.840871)
+#                   ===== Prior Benchmarks (Daru 0.1.2 - prior to sorted merge algorithm) =====
+# Note that the n here is 10x smaller than above
+#       user     system      total        real
+#Inner join (n=1000)  0.170000   0.010000   0.180000 (  0.175585)
+#Outer join (n=1000)  0.990000   0.000000   0.990000 (  1.004305)
+#       user     system      total        real
+#Inner join (n=2000)  0.440000   0.010000   0.450000 (  0.446748)
+#Outer join (n=2000)  3.880000   0.010000   3.890000 (  3.926399)
+#       user     system      total        real
+#Inner join (n=4000)  1.670000   0.010000   1.680000 (  1.680742)
+#Outer join (n=4000) 15.640000   0.060000  15.700000 ( 15.855202)

data/benchmarks/sorting.rb CHANGED

@@ -28,8 +28,8 @@ Benchmark.bm do |x|
   x.report("Sort two columns with custom operators in different orders of DataFrame") do
     df.sort([:c,:a], ascending: [true, false],
-      by: { c: lambda { |a,b| a.to_s <=> b.to_s },
-            a: lambda { |a,b| (a+1) <=> (b+1) } })
+      by: { c: lambda { |a| a.to_s },
+            a: lambda { |a| a+1 } })
   end
 end
@@ -42,3 +42,10 @@ end
 # Sort single column of DataFrame                                          2502.450000 0.000000 2502.450000 (2503.808073)
 # Sort two columns of DataFrame                                            0.540000    0.000000 0.540000    (  0.537670)
 # Sort two columns with custom operators in different orders of DataFrame  2084.160000 7.260000 2091.420000 (2092.716603)
+#                                         ===== Current Benchmarks =====
+# Sort a Vector without any args                                           0.070000   0.000000   0.070000 (  0.070323)
+# Sort vector in descending order with custom <=> operator                 0.120000   0.000000   0.120000 (  0.119462)
+# Sort single column of DataFrame                                          0.940000   0.010000   0.950000 (  0.950349)
+# Sort two columns of DataFrame                                            1.490000   0.010000   1.500000 (  1.505680)
+# Sort two columns with custom operators in different orders of DataFrame  1.480000   0.000000   1.480000 (  1.495839)

data/benchmarks/statistics.rb ADDED

@@ -0,0 +1,39 @@
+require 'daru'
+require 'benchmark'
+vector = Daru::Vector.new(
+  (10**6).times.map.to_a.shuffle,
+  missing_values: 100.times.map.to_a.shuffle
+  )
+vector_gsl = Daru::Vector.new(
+  10000.times.map.to_a.shuffle,
+  missing_values: 100.times.map.to_a.shuffle,
+  dtype: :gsl
+  )
+Benchmark.bm do |x|
+  x.report("Mean of a vector") do
+    vector.mean
+  end
+  x.report("Minimum of a vector") do
+    vector.min
+  end
+  x.report("Mean of a vector with data type gsl") do
+    vector_gsl.mean
+  end
+  x.report "Minimum of a vector with data type gsl" do
+    vector_gsl.min
+  end
+end
+#                    ===== Benchmarks =====
+#
+#                                     user     system      total        real
+# Mean of a vector                 0.130000   0.010000   0.140000 (  0.145534)
+# Min of a vector                  0.150000   0.000000   0.150000 (  0.163623)
+# Mean of a gsl vector             0.000000   0.000000   0.000000 (  0.001037)
+# Min of a gsl vector              0.000000   0.000000   0.000000 (  0.001251)