polars-df 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/lib/polars/io.rb CHANGED
@@ -1,8 +1,245 @@
 module Polars
   module IO
-    def read_csv(file, has_header: true)
+    def read_csv(
+      file,
+      has_header: true,
+      columns: nil,
+      new_columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 8192,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      storage_options: nil,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n"
+    )
+      _check_arg_is_1byte("sep", sep, false)
+      _check_arg_is_1byte("comment_char", comment_char, false)
+      _check_arg_is_1byte("quote_char", quote_char, true)
+      _check_arg_is_1byte("eol_char", eol_char, false)
+
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      storage_options ||= {}
+
+      if columns && !has_header
+        columns.each do |column|
+          if !column.start_with?("column_")
+            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+          end
+        end
+      end
+
+      if projection || new_columns
+        raise Todo
+      end
+
+      df = nil
       _prepare_file_arg(file) do |data|
-        DataFrame._read_csv(data, has_header: has_header)
+        df = DataFrame._read_csv(
+          data,
+          has_header: has_header,
+          columns: columns || projection,
+          sep: sep,
+          comment_char: comment_char,
+          quote_char: quote_char,
+          skip_rows: skip_rows,
+          dtypes: dtypes,
+          null_values: null_values,
+          ignore_errors: ignore_errors,
+          parse_dates: parse_dates,
+          n_threads: n_threads,
+          infer_schema_length: infer_schema_length,
+          batch_size: batch_size,
+          n_rows: n_rows,
+          encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+          low_memory: low_memory,
+          rechunk: rechunk,
+          skip_rows_after_header: skip_rows_after_header,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          sample_size: sample_size,
+          eol_char: eol_char
+        )
+      end
+
+      if new_columns
+        Utils._update_columns(df, new_columns)
+      else
+        df
+      end
+    end
+
+    def scan_csv(
+      file,
+      has_header: true,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      cache: true,
+      with_column_names: nil,
+      infer_schema_length: 100,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      parse_dates: false,
+      eol_char: "\n"
+    )
+      _check_arg_is_1byte("sep", sep, false)
+      _check_arg_is_1byte("comment_char", comment_char, false)
+      _check_arg_is_1byte("quote_char", quote_char, true)
+
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_csv(
+        file,
+        has_header: has_header,
+        sep: sep,
+        comment_char: comment_char,
+        quote_char: quote_char,
+        skip_rows: skip_rows,
+        dtypes: dtypes,
+        null_values: null_values,
+        ignore_errors: ignore_errors,
+        cache: cache,
+        with_column_names: with_column_names,
+        infer_schema_length: infer_schema_length,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        skip_rows_after_header: skip_rows_after_header,
+        encoding: encoding,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        parse_dates: parse_dates,
+        eol_char: eol_char
+      )
+    end
+
+    def scan_ipc(
+      file,
+      n_rows: nil,
+      cache: true,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      memory_map: true
+    )
+      LazyFrame._scan_ipc(
+        file,
+        n_rows: n_rows,
+        cache: cache,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        storage_options: storage_options,
+        memory_map: memory_map
+      )
+    end
+
+    def scan_parquet(
+      file,
+      n_rows: nil,
+      cache: true,
+      parallel: "auto",
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0,
+      storage_options: nil,
+      low_memory: false
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_parquet(
+        file,
+        n_rows: n_rows,
+        cache: cache,
+        parallel: parallel,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        storage_options: storage_options,
+        low_memory: low_memory
+      )
+    end
+
+    def scan_ndjson(
+      file,
+      infer_schema_length: 100,
+      batch_size: 1024,
+      n_rows: nil,
+      low_memory: false,
+      rechunk: true,
+      row_count_name: nil,
+      row_count_offset: 0
+    )
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      LazyFrame._scan_ndjson(
+        file,
+        infer_schema_length: infer_schema_length,
+        batch_size: batch_size,
+        n_rows: n_rows,
+        low_memory: low_memory,
+        rechunk: rechunk,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset
+      )
+    end
+
+    # def read_avro
+    # end
+
+    def read_ipc(
+      file,
+      columns: nil,
+      n_rows: nil,
+      memory_map: true,
+      storage_options: nil,
+      row_count_name: nil,
+      row_count_offset: 0,
+      rechunk: true
+    )
+      storage_options ||= {}
+      _prepare_file_arg(file, **storage_options) do |data|
+        DataFrame._read_ipc(
+          data,
+          columns: columns,
+          n_rows: n_rows,
+          row_count_name: row_count_name,
+          row_count_offset: row_count_offset,
+          rechunk: rechunk,
+          memory_map: memory_map
+        )
       end
     end
 
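Version 0.1.2 replaces the two-argument read_csv with a fully keyworded signature and adds lazy scan_csv, scan_ipc, scan_parquet, and scan_ndjson entry points that build a LazyFrame instead of materializing a DataFrame. A minimal usage sketch of the new surface (the file names are hypothetical, and only options visible in this diff are used):

    # Eager read with the new parsing options.
    df = Polars.read_csv(
      "example.csv",            # hypothetical file
      sep: ";",
      null_values: ["NA", ""],
      n_rows: 1_000,
      parse_dates: true
    )

    # Lazy scan: returns a LazyFrame (built via LazyFrame._scan_csv above)
    # rather than reading the file eagerly.
    lf = Polars.scan_csv("example.csv", skip_rows_after_header: 1)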
@@ -20,6 +257,96 @@ module Polars
       DataFrame._read_ndjson(file)
     end
 
+    # def read_sql
+    # end
+
+    # def read_excel
+    # end
+
+    def read_csv_batched(
+      file,
+      has_header: true,
+      columns: nil,
+      new_columns: nil,
+      sep: ",",
+      comment_char: nil,
+      quote_char: '"',
+      skip_rows: 0,
+      dtypes: nil,
+      null_values: nil,
+      ignore_errors: false,
+      parse_dates: false,
+      n_threads: nil,
+      infer_schema_length: 100,
+      batch_size: 50_000,
+      n_rows: nil,
+      encoding: "utf8",
+      low_memory: false,
+      rechunk: true,
+      skip_rows_after_header: 0,
+      row_count_name: nil,
+      row_count_offset: 0,
+      sample_size: 1024,
+      eol_char: "\n"
+    )
+      projection, columns = Utils.handle_projection_columns(columns)
+
+      if columns && !has_header
+        columns.each do |column|
+          if !column.start_with?("column_")
+            raise ArgumentError, "Specified column names do not start with \"column_\", but autogenerated header names were requested."
+          end
+        end
+      end
+
+      if projection || new_columns
+        raise Todo
+      end
+
+      BatchedCsvReader.new(
+        file,
+        has_header: has_header,
+        columns: columns || projection,
+        sep: sep,
+        comment_char: comment_char,
+        quote_char: quote_char,
+        skip_rows: skip_rows,
+        dtypes: dtypes,
+        null_values: null_values,
+        ignore_errors: ignore_errors,
+        parse_dates: parse_dates,
+        n_threads: n_threads,
+        infer_schema_length: infer_schema_length,
+        batch_size: batch_size,
+        n_rows: n_rows,
+        encoding: encoding == "utf8-lossy" ? encoding : "utf8",
+        low_memory: low_memory,
+        rechunk: rechunk,
+        skip_rows_after_header: skip_rows_after_header,
+        row_count_name: row_count_name,
+        row_count_offset: row_count_offset,
+        sample_size: sample_size,
+        eol_char: eol_char,
+        new_columns: new_columns
+      )
+    end
+
+    def read_ipc_schema(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      _ipc_schema(file)
+    end
+
+    def read_parquet_schema(file)
+      if file.is_a?(String) || (defined?(Pathname) && file.is_a?(Pathname))
+        file = Utils.format_path(file)
+      end
+
+      _parquet_schema(file)
+    end
+
     private
 
     def _prepare_file_arg(file)
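Unlike read_csv, the new read_csv_batched does not return a DataFrame: it constructs a BatchedCsvReader so large files can be consumed in chunks. A sketch of how that reader might be driven, assuming it exposes a next_batches method as the equivalent batched reader does in upstream Polars (that method is not part of this diff):

    reader = Polars.read_csv_batched("big.csv", batch_size: 50_000)  # hypothetical file
    # next_batches is an assumption borrowed from upstream Polars, not shown here.
    while (batches = reader.next_batches(5))
      batches.each do |df|
        # process each DataFrame chunk
      end
    end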
@@ -35,5 +362,18 @@ module Polars
 
       yield file
     end
+
+    def _check_arg_is_1byte(arg_name, arg, can_be_empty = false)
+      if arg.is_a?(String)
+        arg_byte_length = arg.bytesize
+        if can_be_empty
+          if arg_byte_length > 1
+            raise ArgumentError, "#{arg_name} should be a single byte character or empty, but is #{arg_byte_length} bytes long."
+          end
+        elsif arg_byte_length != 1
+          raise ArgumentError, "#{arg_name} should be a single byte character, but is #{arg_byte_length} bytes long."
+        end
+      end
+    end
   end
 end
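The new private _check_arg_is_1byte helper enforces the single-byte contract on the delimiter-style options: read_csv validates sep, comment_char, and eol_char with can_be_empty = false, and quote_char with true, so only quote_char may be empty. Note that it measures bytes, not characters, so multi-byte UTF-8 characters are rejected, and since the checks run before the file is opened, bad arguments fail fast:

    Polars.read_csv("example.csv", sep: "||")
    # => ArgumentError: sep should be a single byte character, but is 2 bytes long.

    Polars.read_csv("example.csv", sep: "€")   # the euro sign is 3 bytes in UTF-8
    # => ArgumentError: sep should be a single byte character, but is 3 bytes long.

    # Passes the byte-length check because can_be_empty is true for quote_char
    # (downstream handling of an empty quote_char is not shown in this diff).
    Polars.read_csv("example.csv", quote_char: "")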