RubyGems - spark-connect - Versions diffs - 0.2.0 - Mend

spark-connect 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (53) hide show

checksums.yaml +7 -0
data/CHANGELOG.md +82 -0
data/LICENSE +202 -0
data/NOTICE +16 -0
data/README.md +166 -0
data/lib/spark-connect.rb +5 -0
data/lib/spark_connect/arrow.rb +115 -0
data/lib/spark_connect/catalog.rb +190 -0
data/lib/spark_connect/channel_builder.rb +134 -0
data/lib/spark_connect/client.rb +264 -0
data/lib/spark_connect/column.rb +379 -0
data/lib/spark_connect/conf.rb +79 -0
data/lib/spark_connect/data_frame.rb +828 -0
data/lib/spark_connect/errors.rb +58 -0
data/lib/spark_connect/functions.rb +903 -0
data/lib/spark_connect/grouped_data.rb +101 -0
data/lib/spark_connect/na_functions.rb +98 -0
data/lib/spark_connect/observation.rb +61 -0
data/lib/spark_connect/pipelines.rb +221 -0
data/lib/spark_connect/plan.rb +39 -0
data/lib/spark_connect/proto/spark/connect/base_pb.rb +118 -0
data/lib/spark_connect/proto/spark/connect/base_services_pb.rb +82 -0
data/lib/spark_connect/proto/spark/connect/catalog_pb.rb +46 -0
data/lib/spark_connect/proto/spark/connect/commands_pb.rb +67 -0
data/lib/spark_connect/proto/spark/connect/common_pb.rb +32 -0
data/lib/spark_connect/proto/spark/connect/expressions_pb.rb +63 -0
data/lib/spark_connect/proto/spark/connect/ml_common_pb.rb +22 -0
data/lib/spark_connect/proto/spark/connect/ml_pb.rb +32 -0
data/lib/spark_connect/proto/spark/connect/pipelines_pb.rb +45 -0
data/lib/spark_connect/proto/spark/connect/relations_pb.rb +102 -0
data/lib/spark_connect/proto/spark/connect/types_pb.rb +46 -0
data/lib/spark_connect/proto.rb +32 -0
data/lib/spark_connect/reader.rb +98 -0
data/lib/spark_connect/row.rb +105 -0
data/lib/spark_connect/session.rb +317 -0
data/lib/spark_connect/stat_functions.rb +109 -0
data/lib/spark_connect/streaming.rb +351 -0
data/lib/spark_connect/types.rb +490 -0
data/lib/spark_connect/version.rb +11 -0
data/lib/spark_connect/window.rb +119 -0
data/lib/spark_connect/writer.rb +208 -0
data/lib/spark_connect.rb +58 -0
data/proto/spark/connect/base.proto +1275 -0
data/proto/spark/connect/catalog.proto +243 -0
data/proto/spark/connect/commands.proto +553 -0
data/proto/spark/connect/common.proto +179 -0
data/proto/spark/connect/expressions.proto +557 -0
data/proto/spark/connect/ml.proto +147 -0
data/proto/spark/connect/ml_common.proto +64 -0
data/proto/spark/connect/pipelines.proto +307 -0
data/proto/spark/connect/relations.proto +1252 -0
data/proto/spark/connect/types.proto +227 -0
metadata +149 -0

data/lib/spark_connect/grouped_data.rb ADDED Viewed

@@ -0,0 +1,101 @@
+# frozen_string_literal: true
+module SparkConnect
+  # The result of {DataFrame#group_by} / {DataFrame#rollup} / {DataFrame#cube}.
+  # Call an aggregate ({#agg}, {#count}, {#sum}, {#avg}, {#max}, {#min}, ...) to
+  # produce a new {DataFrame}, optionally after {#pivot}.
+  #
+  # @example
+  #   df.group_by("dept").agg(F.avg("salary").alias("avg_salary"), F.count("*"))
+  #   df.group_by("dept").pivot("year").sum("revenue")
+  class GroupedData
+    Proto = SparkConnect::Proto
+    # @param df [DataFrame]
+    # @param grouping [Array<Column>] grouping columns.
+    # @param group_type [Symbol] a `GROUP_TYPE_*` enum symbol.
+    # @param pivot_col [Column, nil]
+    # @param pivot_values [Array, nil]
+    def initialize(df, grouping, group_type, pivot_col: nil, pivot_values: nil)
+      @df = df
+      @grouping = grouping
+      @group_type = group_type
+      @pivot_col = pivot_col
+      @pivot_values = pivot_values
+    end
+    # Compute aggregate expressions.
+    #
+    # @overload agg(*columns)
+    #   @param columns [Array<Column>] aggregate columns, e.g. `F.sum("x")`.
+    # @overload agg(hash)
+    #   @param hash [Hash{String=>String}] column-to-function map, e.g.
+    #     `{"age" => "max", "salary" => "avg"}`.
+    # @return [DataFrame]
+    def agg(*exprs)
+      agg_exprs =
+        if exprs.size == 1 && exprs.first.is_a?(Hash)
+          exprs.first.map { |col, fn| Column.invoke(fn.to_s, Functions.col(col.to_s)).to_expr }
+        else
+          exprs.flatten.map { |c| Column.to_col(c).to_expr }
+        end
+      build(agg_exprs)
+    end
+    # Count rows per group.
+    # @return [DataFrame]
+    def count
+      build([Column.invoke("count", Column.lit(1)).alias("count").to_expr])
+    end
+    # Sum of each numeric column (or all numeric columns when none given).
+    # @return [DataFrame]
+    def sum(*cols) = numeric_agg("sum", cols)
+    # Mean of each numeric column.
+    # @return [DataFrame]
+    def avg(*cols) = numeric_agg("avg", cols)
+    alias mean avg
+    # Maximum of each column.
+    # @return [DataFrame]
+    def max(*cols) = numeric_agg("max", cols)
+    # Minimum of each column.
+    # @return [DataFrame]
+    def min(*cols) = numeric_agg("min", cols)
+    # Pivot a column into multiple output columns.
+    #
+    # @param pivot_col [String, Column]
+    # @param values [Array, nil] optional explicit pivot values (faster, deterministic).
+    # @return [GroupedData]
+    def pivot(pivot_col, values = nil)
+      GroupedData.new(@df, @grouping, :GROUP_TYPE_PIVOT,
+                      pivot_col: Column.to_col(pivot_col.is_a?(String) ? Functions.col(pivot_col) : pivot_col),
+                      pivot_values: values)
+    end
+    private
+    def numeric_agg(fn, cols)
+      exprs = cols.flatten.map { |c| Column.invoke(fn, Functions.col(c.to_s)).to_expr }
+      build(exprs)
+    end
+    def build(agg_exprs)
+      agg = Proto::Aggregate.new(
+        input: @df.relation,
+        group_type: @group_type,
+        grouping_expressions: @grouping.map(&:to_expr),
+        aggregate_expressions: agg_exprs
+      )
+      if @group_type == :GROUP_TYPE_PIVOT
+        pivot = Proto::Aggregate::Pivot.new(col: @pivot_col.to_expr)
+        pivot.values += @pivot_values.map { |v| Column.lit(v).to_expr.literal } if @pivot_values
+        agg.pivot = pivot
+      end
+      @df.build(aggregate: agg)
+    end
+  end
+end

data/lib/spark_connect/na_functions.rb ADDED Viewed

@@ -0,0 +1,98 @@
+# frozen_string_literal: true
+module SparkConnect
+  # Missing-data helpers, returned by {DataFrame#na}. Mirrors PySpark's
+  # `DataFrame.na` (`DataFrameNaFunctions`).
+  #
+  # @example
+  #   df.na.drop(how: :any)
+  #   df.na.fill(0)
+  #   df.na.fill({ "name" => "unknown", "age" => 0 })
+  #   df.na.replace("UNKNOWN", nil, subset: ["name"])
+  class DataFrameNaFunctions
+    Proto = SparkConnect::Proto
+    # @param df [DataFrame]
+    def initialize(df)
+      @df = df
+    end
+    # Drop rows containing null values.
+    #
+    # @param how [Symbol] `:any` (drop if any field is null) or `:all`.
+    # @param thresh [Integer, nil] keep rows with at least this many non-null
+    #   values (overrides `how` when given).
+    # @param subset [Array<String>, nil] only consider these columns.
+    # @return [DataFrame]
+    def drop(how: :any, thresh: nil, subset: nil)
+      cols = Array(subset).map(&:to_s)
+      min_non_nulls = thresh || (if how.to_sym == :all
+                                   1
+                                 else
+                                   (cols.empty? ? nil : cols.size)
+                                 end)
+      nd = Proto::NADrop.new(input: @df.relation, cols: cols)
+      nd.min_non_nulls = min_non_nulls if min_non_nulls
+      @df.build(drop_na: nd)
+    end
+    # Replace null values.
+    #
+    # @overload fill(value, subset: nil)
+    #   @param value [Object] a scalar used to fill all (or `subset`) columns.
+    # @overload fill(value_map)
+    #   @param value_map [Hash{String=>Object}] per-column fill values.
+    # @return [DataFrame]
+    def fill(value, subset: nil)
+      cols, values =
+        if value.is_a?(Hash)
+          [value.keys.map(&:to_s), value.values]
+        else
+          [Array(subset).map(&:to_s), Array(subset).empty? ? [value] : Array(subset).map { value }]
+        end
+      nf = Proto::NAFill.new(
+        input: @df.relation, cols: cols, values: values.map { |v| na_literal(v) }
+      )
+      @df.build(fill_na: nf)
+    end
+    # Replace specific values with others.
+    #
+    # @param to_replace [Object, Array, Hash] value(s) to replace, or a
+    #   `{old => new}` mapping.
+    # @param value [Object, Array, nil] replacement value(s) when `to_replace`
+    #   is not a Hash.
+    # @param subset [Array<String>, nil]
+    # @return [DataFrame]
+    def replace(to_replace, value = nil, subset: nil)
+      mapping =
+        if to_replace.is_a?(Hash)
+          to_replace
+        else
+          Array(to_replace).zip(Array(value)).to_h
+        end
+      replacements = mapping.map do |old, new_value|
+        Proto::NAReplace::Replacement.new(
+          old_value: na_literal(old), new_value: na_literal(new_value)
+        )
+      end
+      nr = Proto::NAReplace.new(
+        input: @df.relation, cols: Array(subset).map(&:to_s), replacements: replacements
+      )
+      @df.build(replace: nr)
+    end
+    private
+    # Spark's fill/replace handlers only accept Long, Double, String, or Boolean
+    # literal values (not 32-bit Int), so widen Ruby Integers to Long and
+    # Floats to Double.
+    def na_literal(value)
+      case value
+      when Integer then Proto::Expression::Literal.new(long: value)
+      when Float then Proto::Expression::Literal.new(double: value)
+      else Column.to_literal(value)
+      end
+    end
+  end
+end

data/lib/spark_connect/observation.rb ADDED Viewed

@@ -0,0 +1,61 @@
+# frozen_string_literal: true
+module SparkConnect
+  # Captures named aggregate metrics computed while a {DataFrame} is being
+  # materialised, without an extra pass over the data. Pair with
+  # {DataFrame#observe}.
+  #
+  # @example
+  #   obs = SparkConnect::Observation.new("metrics")
+  #   df.observe(obs, F.count(F.lit(1)).alias("rows"), F.max("id").alias("max_id")).collect
+  #   obs.get  #=> {"rows"=>100, "max_id"=>99}
+  class Observation
+    # @return [String] the observation name.
+    attr_reader :name
+    @counter = 0
+    class << self
+      # @api private
+      attr_accessor :counter
+    end
+    # @param name [String, nil] a unique name (auto-generated when omitted).
+    def initialize(name = nil)
+      Observation.counter += 1
+      @name = name || "observation_#{Observation.counter}"
+      @df = nil
+    end
+    # @api private - bind the observed DataFrame so {#get} can fetch metrics.
+    def bind(df)
+      @df = df
+      self
+    end
+    # The observed metric values (forces execution if not yet materialised).
+    #
+    # @return [Hash{String=>Object}]
+    def get
+      raise IllegalArgumentError, "Observation has not been attached to a DataFrame yet" unless @df
+      @metrics ||= fetch_metrics
+    end
+    private
+    def fetch_metrics
+      result = @df.session.client.execute_plan(@df.relation)
+      observed = result.observed_metrics.find { |m| m.name == @name } || result.observed_metrics.first
+      return {} unless observed
+      keys = observed.keys.to_a
+      decoded = observed.values.map { |lit| decode_literal(lit) }
+      keys.zip(decoded).to_h
+    end
+    def decode_literal(literal)
+      kind = literal.literal_type
+      kind ? literal.public_send(kind) : nil
+    end
+  end
+end

data/lib/spark_connect/pipelines.rb ADDED Viewed

@@ -0,0 +1,221 @@
+# frozen_string_literal: true
+module SparkConnect
+  # A timestamped event emitted by the server during a pipeline run.
+  PipelineEvent = Struct.new(:timestamp, :message)
+  # A Spark Declarative Pipeline (SDP) dataflow graph.
+  #
+  # A pipeline is built by registering **outputs** (tables, materialized views,
+  # temporary views, or sinks) and the **flows** that populate them, then
+  # started with {#start_run}. Each flow is defined by a {DataFrame} (an
+  # unresolved relation), so you compose flows with the same API you use for
+  # ordinary queries.
+  #
+  # Create one with {SparkSession#pipeline}.
+  #
+  # @example
+  #   pipe = spark.pipeline(storage: "/tmp/pipeline_storage")
+  #   pipe.create_materialized_view("bronze", spark.read.json("/data/raw"))
+  #   pipe.create_table("silver", pipe.read("bronze").filter(F.col("ok")))
+  #   events = pipe.start_run
+  #
+  # @note `foreach`/`foreachBatch` flows and Python query-function evaluation are
+  #   not supported (they require UDFs); define each flow with a relation instead.
+  class Pipeline
+    Proto = SparkConnect::Proto
+    PC = Proto::PipelineCommand
+    OUTPUT_TYPES = {
+      table: :TABLE,
+      materialized_view: :MATERIALIZED_VIEW,
+      temporary_view: :TEMPORARY_VIEW,
+      sink: :SINK,
+    }.freeze
+    # @return [String] the server-assigned dataflow graph id.
+    attr_reader :graph_id
+    # @param session [SparkSession]
+    # @param default_catalog [String, nil]
+    # @param default_database [String, nil]
+    # @param sql_conf [Hash{String=>String}]
+    def initialize(session, default_catalog: nil, default_database: nil, sql_conf: {})
+      @session = session
+      cmd = PC::CreateDataflowGraph.new(sql_conf: stringify(sql_conf))
+      cmd.default_catalog = default_catalog if default_catalog
+      cmd.default_database = default_database if default_database
+      result = dispatch(PC.new(create_dataflow_graph: cmd))
+      @graph_id = result.pipeline_command_result.create_dataflow_graph_result.dataflow_graph_id
+    end
+    # Reference a dataset defined in this pipeline as a {DataFrame} (so later
+    # flows can read from earlier outputs).
+    #
+    # @param name [String]
+    # @return [DataFrame]
+    def read(name)
+      @session.read.table(name)
+    end
+    # Define a published table and the flow that populates it.
+    #
+    # @param name [String]
+    # @param df [DataFrame, nil] the query that populates the table (a flow).
+    # @return [String] the resolved output identifier.
+    def create_table(name, df = nil, comment: nil, format: nil, partition_cols: [],
+                     clustering_columns: [], table_properties: {}, schema: nil)
+      define_table_output(name, :table, df, comment: comment, format: format,
+                                            partition_cols: partition_cols, clustering_columns: clustering_columns,
+                                            table_properties: table_properties, schema: schema)
+    end
+    # Define a materialized view and the flow that populates it.
+    # @return [String]
+    def create_materialized_view(name, df = nil, comment: nil, format: nil, partition_cols: [],
+                                 clustering_columns: [], table_properties: {}, schema: nil)
+      define_table_output(name, :materialized_view, df, comment: comment, format: format,
+                                                        partition_cols: partition_cols, clustering_columns: clustering_columns,
+                                                        table_properties: table_properties, schema: schema)
+    end
+    # Define a (non-published) temporary view and its flow.
+    # @return [String]
+    def create_temporary_view(name, df = nil, comment: nil)
+      define_table_output(name, :temporary_view, df, comment: comment)
+    end
+    # Define a streaming sink.
+    #
+    # @param name [String]
+    # @param df [DataFrame] the flow feeding the sink.
+    # @param format [String, nil]
+    # @param options [Hash{String=>String}]
+    # @return [String]
+    def create_sink(name, df, format: nil, options: {})
+      sink = PC::DefineOutput::SinkDetails.new(options: stringify(options))
+      sink.format = format if format
+      define_output(name, :sink, sink_details: sink)
+      define_flow(name, df, target: name)
+      name
+    end
+    # Define a flow that writes the contents of `df` into `target`.
+    #
+    # @param name [String] the flow name.
+    # @param df [DataFrame]
+    # @param target [String] the dataset the flow writes to (defaults to `name`).
+    # @param once [Boolean] define as a one-time (batch) flow.
+    # @param sql_conf [Hash{String=>String}]
+    # @return [String] the resolved flow name.
+    def define_flow(name, df, target: nil, once: false, sql_conf: {})
+      flow = PC::DefineFlow.new(
+        dataflow_graph_id: @graph_id, flow_name: name.to_s, target_dataset_name: (target || name).to_s,
+        sql_conf: stringify(sql_conf),
+        relation_flow_details: PC::DefineFlow::WriteRelationFlowDetails.new(relation: df.relation)
+      )
+      # `once` is optional: only set it when true, since the server rejects the
+      # option being present at all for non-one-time flows (e.g. MV flows).
+      flow.once = true if once
+      result = dispatch(PC.new(define_flow: flow))
+      identifier_string(result.pipeline_command_result&.define_flow_result&.resolved_identifier) || name.to_s
+    end
+    # Register datasets and flows from a SQL definition file.
+    #
+    # @param sql_text [String] the SQL source.
+    # @param sql_file_path [String, nil]
+    # @return [void]
+    def define_sql(sql_text, sql_file_path: nil)
+      el = PC::DefineSqlGraphElements.new(dataflow_graph_id: @graph_id, sql_text: sql_text.to_s)
+      el.sql_file_path = sql_file_path if sql_file_path
+      dispatch(PC.new(define_sql_graph_elements: el))
+      nil
+    end
+    # Resolve the graph and run a pipeline update. Blocks until the update
+    # completes, returning the events emitted during the run.
+    #
+    # @param full_refresh [Array<String>] datasets to reset and recompute.
+    # @param full_refresh_all [Boolean] reset and recompute everything.
+    # @param refresh [Array<String>] datasets to update.
+    # @param dry [Boolean] validate the graph without executing flows.
+    # @param storage [String, nil] checkpoint/metadata storage location.
+    # @yieldparam event [PipelineEvent] each event, as it is collected
+    # @return [Array<PipelineEvent>]
+    def start_run(full_refresh: [], full_refresh_all: false, refresh: [], dry: false, storage: nil, &block)
+      run = PC::StartRun.new(
+        dataflow_graph_id: @graph_id,
+        full_refresh_selection: Array(full_refresh).map(&:to_s),
+        full_refresh_all: full_refresh_all,
+        refresh_selection: Array(refresh).map(&:to_s),
+        dry: dry
+      )
+      run.storage = storage if storage
+      result = dispatch(PC.new(start_run: run))
+      events = result.pipeline_events.map { |e| PipelineEvent.new(e.timestamp, e.message) }
+      events.each(&block) if block
+      events
+    end
+    # Drop this dataflow graph and stop any attached flows.
+    # @return [void]
+    def drop
+      dispatch(PC.new(drop_dataflow_graph: PC::DropDataflowGraph.new(dataflow_graph_id: @graph_id)))
+      nil
+    end
+    private
+    def define_table_output(name, type, df, comment: nil, format: nil, partition_cols: [],
+                            clustering_columns: [], table_properties: {}, schema: nil)
+      details = nil
+      if type != :temporary_view || format || schema
+        details = PC::DefineOutput::TableDetails.new(
+          table_properties: stringify(table_properties),
+          partition_cols: Array(partition_cols).map(&:to_s),
+          clustering_columns: Array(clustering_columns).map(&:to_s)
+        )
+        details.format = format if format
+        apply_schema(details, schema) if schema
+      end
+      resolved = define_output(name, type, table_details: details, comment: comment)
+      define_flow(name, df, target: name) if df
+      resolved
+    end
+    def define_output(name, type, table_details: nil, sink_details: nil, comment: nil)
+      output = PC::DefineOutput.new(
+        dataflow_graph_id: @graph_id, output_name: name.to_s, output_type: OUTPUT_TYPES.fetch(type)
+      )
+      output.comment = comment if comment
+      output.table_details = table_details if table_details
+      output.sink_details = sink_details if sink_details
+      result = dispatch(PC.new(define_output: output))
+      identifier_string(result.pipeline_command_result&.define_output_result&.resolved_identifier) || name.to_s
+    end
+    def apply_schema(details, schema)
+      if schema.is_a?(Types::DataType)
+        details.schema_data_type = schema.to_proto
+      else
+        details.schema_string = schema.to_s
+      end
+    end
+    def dispatch(pipeline_command)
+      @session.client.execute_command(Proto::Command.new(pipeline_command: pipeline_command))
+    end
+    def identifier_string(resolved)
+      return nil unless resolved
+      parts = [resolved.catalog_name, *resolved.namespace, resolved.table_name].reject { |p| p.nil? || p.empty? }
+      parts.join(".")
+    end
+    def stringify(hash)
+      hash.to_h { |k, v| [k.to_s, v.to_s] }
+    end
+  end
+end

data/lib/spark_connect/plan.rb ADDED Viewed

@@ -0,0 +1,39 @@
+# frozen_string_literal: true
+module SparkConnect
+  # Low-level helpers for assembling the protobuf logical plan that the client
+  # sends to the server. {DataFrame} and {SparkSession} build relations through
+  # these helpers so that every relation carries a unique `plan_id` (used by the
+  # server to resolve columns to a specific subtree, e.g. for self-joins).
+  module PlanBuilder
+    Proto = SparkConnect::Proto
+    module_function
+    # Wrap a `rel_type` oneof keyword into a {Spark::Connect::Relation},
+    # attaching a fresh `plan_id` from `id_source`.
+    #
+    # @param id_source [#next_plan_id] usually a {SparkSession}.
+    # @param rel [Hash] exactly one `rel_type` keyword, e.g. `project:`.
+    # @return [Spark::Connect::Relation]
+    def relation(id_source, **rel)
+      Proto::Relation.new(common: Proto::RelationCommon.new(plan_id: id_source.next_plan_id), **rel)
+    end
+    # Wrap a relation as the root of an executable {Spark::Connect::Plan}.
+    #
+    # @param relation [Spark::Connect::Relation]
+    # @return [Spark::Connect::Plan]
+    def root_plan(relation)
+      Proto::Plan.new(root: relation)
+    end
+    # Wrap a command as an executable {Spark::Connect::Plan}.
+    #
+    # @param command [Spark::Connect::Command]
+    # @return [Spark::Connect::Plan]
+    def command_plan(command)
+      Proto::Plan.new(command: command)
+    end
+  end
+end