RubyGems - dbx-api - Versions diffs - 0.1.1 → 0.2.0 - Mend

dbx-api 0.1.1 → 0.2.0

Files changed (10) hide show

checksums.yaml +4 -4
data/.rubocop.yml +3 -0
data/CHANGELOG.md +10 -1
data/Gemfile.lock +1 -1
data/README.md +61 -6
data/lib/dbx/databricks/sql.rb +31 -45
data/lib/dbx/databricks/sql_response.rb +113 -0
data/lib/dbx/gateway.rb +4 -1
data/lib/dbx/version.rb +1 -1
metadata +3 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 6f72e4b53309594553a8616e66bd27008585029c0b61d78b4e749c697c964064
-  data.tar.gz: 8bdd6b0e7025486de43a2951a33a0726e94374f3299f9d6e8526d70d424888c7
+  metadata.gz: '09cd7a1984478b2761fbe0dca4b69acd123d664c96ea7333997143fe4389aa3b'
+  data.tar.gz: 5e1093ab32b19c13eff195869d12dc159459f4418f3bc260df241e737cd5f78f
 SHA512:
-  metadata.gz: 2ae01403a40e9688aea026bff939788e83615a017c8333e8c7b50c5e83b88c6f0b7fe0e8244186f07cb47f81757fc801d62980f6eb160e8c71059283c3b7879f
-  data.tar.gz: 131e3d2d072e05fcab4d456f767b87bc8871decea1abab8e11c15c92cfafe6afa5916c4637744553d3bef35eac3fb7b9395b12843bb8d6bdeeffe5e2509e9039
+  metadata.gz: 407516bedbe4fa69d01ad765804aa2593966faebfa966eadac4ed08da44fb41fc7ea055b2be781194e762916784c4cb1d0bb0d4c44b9b3ad8ed23273415277f2
+  data.tar.gz: 569cdbc0465214559dd397e27d143cf3e56fbc96e1f1447c45bd18ba472ef444ec712cf16ae69837b6380de54135c3e42dd1af775f22221954b149c70f249a5c

data/.rubocop.yml CHANGED Viewed

@@ -12,3 +12,6 @@ Style/StringLiteralsInInterpolation:
 Layout/LineLength:
   Max: 120
+Metrics/BlockLength:
+  Enabled: false

data/CHANGELOG.md CHANGED Viewed

@@ -1,5 +1,14 @@
 ## [Unreleased]
-## [0.1.0] - 2023-09-25
+## [0.1.1] - 2023-09-27
+- Yanked because I didn't know what I was doing
+## [0.1.2]
 - Initial release
+## [0.2.0]
+- Added `DatabricksSQLResponse` class
+- `DatabricksGateway::run_sql` now returns an object of type `DatabricksSQLResponse`
+  - results can be accessed by `DatabricksSQLResponse::results`
+  - query success can be accessed by `DatabricksSQLResponse::success?`
+- Added optional `sleep_timer` parameter to `DatabricksGateway`. This is the number of seconds to wait between checking the status of a query. Defaults to 5 seconds.

data/Gemfile.lock CHANGED Viewed

@@ -1,7 +1,7 @@
 PATH
   remote: .
   specs:
-    dbx-api (0.1.0)
+    dbx-api (0.2.0)
       dotenv (~> 2.0)
 GEM

data/README.md CHANGED Viewed

@@ -4,7 +4,10 @@
 This gem is designed to allow access to the DBX APIs (Jobs and SQL) from ruby applications.
 ## Installation
-TODO: write this section
+Add the following line to your Gemfile to install the gem:
+```ruby
+gem 'dbx-api', '~>0.2.0'
+```
 ## Usage
 Set up your .env file (optional)
@@ -25,17 +28,69 @@ sql_runner = DatabricksGateway.new
 sql_runner = DatabricksGateway.new(host: 'DBX_CONNECTION_STRING', token: 'DBX_ACCESS_TOKEN', warehouse: 'DBX_SQL_WAREHOUSE_ID')
 # Basic sql
-result = sql_runner.run_sql("SELECT 1")
-sql_runner.parse_result(result)
+response = sql_runner.run_sql("SELECT 1")
+response.results
 # => [{"1"=>"1"}]
 # Dummy data in public DBX table
-result = sql_runner.run_sql("SELECT * FROM samples.nyctaxi.trips LIMIT 1")
-sql_runner.parse_result(result)
+response = sql_runner.run_sql("SELECT * FROM samples.nyctaxi.trips LIMIT 1")
+response.results
 # => [{"tpep_pickup_datetime"=>"2016-02-14T16:52:13.000Z",
 #   "tpep_dropoff_datetime"=>"2016-02-14T17:16:04.000Z",
 #   "trip_distance"=>"4.94",
 #   "fare_amount"=>"19.0",
 #   "pickup_zip"=>"10282",
 #   "dropoff_zip"=>"10171"}]
-```
+```
+`run_sql` returns an object of type DatabricksSQLResponse.
+The response object has a few useful methods. For a complete list, see the class definition: `lib/dbx/databricks/sql_response.rb`
+```ruby
+response = sql_runner.run_sql("SELECT 1")
+# checking the status of a response
+response.status # => SUCCEEDED | FAILED | PENDING | RUNNING
+response.failed? # => Boolean
+response.success? # => Boolean
+# getting the results of a response
+response.results # => Array of Hashes
+# looking at the raw response
+response.raw_response # => HTTP object
+# or just the parsed body of the HTTP response
+response.body
+# checking error messages for failed responses
+response.error_message # => String
+```
+This gem makes no assumptions about how errors should be handled. `results` always returns an array, even if the query fails (it returns `[]` when `failed?` is true). Users may wish to check the status of the response before attempting to access the results. For example:
+```ruby
+require 'dbx'
+sql_runner = DatabricksGateway.new
+res = sql_runner.run_sql("SELECT 1")
+# do something with the results if the query succeeded
+return res.results if res.success?
+# do something else if the query failed
+puts "query failed: #{res.error_message}"
+```
+Since `run_sql` returns an instance of `DatabricksSQLResponse`, you can also chain methods together:
+```ruby
+sql_runner.run_sql("SELECT 1").results
+```
+## Development
+- After checking out the repo, run `bin/setup` to install dependencies.
+- Set up your `.env` file as described above.
+- Run `rake spec` to run the rspec tests.
+## Build
+- Run `gem build dbx.gemspec` to build the gem.
+- Run `gem push dbx-api-0.2.0.gem` to push the gem to rubygems.org
+  - Requires logging in to rubygems.org first via `gem login`

data/lib/dbx/databricks/sql.rb CHANGED Viewed

@@ -1,6 +1,7 @@
 # frozen_string_literal: true
 require "json"
+require_relative "sql_response"
 # This module handles the execution of SQL statements via the DBX API.
 # For more information about the DBX SQL API, see: https://docs.databricks.com/sql/admin/sql-execution-tutorial.html
@@ -30,7 +31,7 @@ module DatabricksSQL
   # POST SQL query to DBX
   def post_sql_request(sql)
     response = http.request(sql_request(sql))
-    response.body
+    DatabricksSQLResponse.new(response)
   end
   # GET request object
@@ -40,71 +41,56 @@ module DatabricksSQL
     Net::HTTP::Get.new(req_uri, request_headers)
   end
-  # GET results of SQL query from DBX.
-  def get_sql_results(http_response)
-    statement_id = JSON.parse(http_response)["statement_id"]
-    response = http.request(sql_results_request(statement_id))
-    puts "#{statement_id}: #{JSON.parse(response.body)["status"]["state"]}"
-    response.body
-  end
   # GET SQL chunk from DBX by internal link
+  # @return [DatabricksSQLResponse] wrapping the chunk body {"chunk_index" => Number, "row_offset" => Number, "row_count" => Number, "data_array" => Array<Array>} # rubocop:disable Layout/LineLength
   def get_sql_chunk(chunk_url)
+    puts "GET chunk: #{chunk_url}"
     request = Net::HTTP::Get.new(chunk_url, request_headers)
     response = http.request(request)
-    response.body
+    DatabricksSQLResponse.new(response)
   end
   # Load additional chunks of data from DBX.
   # DBX returns data with maximum chunk size of 16mb.
-  def load_additional_chunks(results_hash)
-    next_chunk = results_hash["result"]["next_chunk_internal_link"]
+  def load_additional_chunks(response)
+    next_chunk = response.next_chunk
     while next_chunk
-      response = get_sql_chunk(next_chunk)
-      parsed_response = JSON.parse(response)
-      result = parsed_response["data_array"]
-      data = results_hash["result"]["data_array"]
-      results_hash["result"]["data_array"] = [*data, *result]
-      next_chunk = parsed_response["next_chunk_internal_link"]
+      chunk_response = get_sql_chunk(next_chunk)
+      response.add_chunk_to_data(chunk_response)
+      next_chunk = chunk_response.next_chunk
     end
   end
+  # GET results of SQL query from DBX.
+  def get_sql_results(dbx_sql_response)
+    statement_id = dbx_sql_response.statement_id
+    http_response = http.request(sql_results_request(statement_id))
+    response = DatabricksSQLResponse.new(http_response)
+    puts "#{statement_id}: #{response.status}"
+    response
+  end
   # Wait for SQL query response from DBX.
   # Returns the last DatabricksSQLResponse received, once the query is no longer PENDING or RUNNING.
   def wait_for_sql_response(response)
     result = get_sql_results(response)
-    status = JSON.parse(result)["status"]["state"]
-    # PENDING means the warehouse is starting up
-    # RUNNING means the query is still executing
-    while %w[PENDING RUNNING].include?(status)
-      sleep(5)
-      result = get_sql_results(response)
-      status = JSON.parse(result)["status"]["state"]
-    end
-    JSON.parse(result)
-  end
+    still_running = result.pending?
-  # Parse JSON response from DBX into array of hashes.
-  # Provides output c/w Big Query.
-  def parse_result(http_response)
-    keys = JSON.parse(http_response)["manifest"]["schema"]["columns"]
-    data_array = JSON.parse(http_response)["result"]["data_array"]
-    data_array.map do |row|
-      hash = {}
-      keys.each do |key|
-        hash[key["name"]] = row[key["position"]]
-      end
-      hash
+    while still_running
+      sleep(@sleep_timer)
+      result = get_sql_results(response)
+      still_running = result.pending?
     end
+    result
   end
   # Submit SQL query to DBX and return results.
-  # returns a JSON string of the results of the SQL query
+  # @return [DatabricksSQLResponse]
   def run_sql(sql)
-    response = post_sql_request(sql)
-    results_hash = wait_for_sql_response(response)
-    load_additional_chunks(results_hash) if results_hash["manifest"]["total_chunk_count"] > 1
-    JSON.dump(results_hash)
+    posted_sql = post_sql_request(sql)
+    sql_results = wait_for_sql_response(posted_sql)
+    load_additional_chunks(sql_results) if sql_results.more_chunks?
+    sql_results
   end
 end

data/lib/dbx/databricks/sql_response.rb ADDED Viewed

@@ -0,0 +1,113 @@
+# frozen_string_literal: true
+require "pry"
+# This class represents a response from the Databricks SQL API.
+# It is used by DatabricksSQL to handle http failures and parse the response body.
+class DatabricksSQLResponse
+  def initialize(http_response)
+    self.raw_response = http_response
+    self.body = parse_body
+    self.data_array = extract_data_array
+  end
+  attr_accessor :raw_response, :body, :data_array
+  # -------------------- BODY --------------------
+  # Parse the response body as JSON.
+  def parse_body
+    return {} unless raw_response.is_a?(Net::HTTPSuccess)
+    @body = JSON.parse(raw_response.body)
+  end
+  # Dig out the statement_id from the response body.
+  # @return [String | nil]
+  def statement_id
+    body["statement_id"]
+  end
+  # -------------------- CHUNKS --------------------
+  # Determine if the response contains multiple chunks.
+  def more_chunks?
+    chunk_count = body&.dig("manifest", "total_chunk_count")&.to_i
+    chunk_count && chunk_count > 1
+  end
+  # Dig out "result" -> "next_chunk_internal_link" from the response body.
+  # @return [String | nil] NOTE(review): chunk bodies have no "result" key, so likely nil for them - confirm.
+  def next_chunk
+    body.dig("result", "next_chunk_internal_link")
+  end
+  # Combine the data from the chunk response into the data from the original response.
+  # @return [Array]
+  def add_chunk_to_data(chunk_response)
+    chunk_data_array = chunk_response.data_array
+    self.data_array = [*data_array, *chunk_data_array]
+  end
+  # -------------------- STATUS --------------------
+  # Determine if the response from the API has succeeded.
+  def success?
+    status == "SUCCEEDED"
+  end
+  # Determine if the response from the API is still executing.
+  # PENDING means the warehouse is starting up
+  # RUNNING means the query is still executing
+  def pending?
+    %w[PENDING RUNNING].include?(status)
+  end
+  # Determine if the response from the API has failed.
+  def failed?
+    status == "FAILED"
+  end
+  # Dig out the error message from the response body.
+  # @return [String | nil]
+  def error_message
+    body.dig("status", "error", "message")
+  end
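+  # Report "FAILED" for any non-successful HTTP response; otherwise dig the query state out of the response body.
+  # @return [String | nil] nil when the body has no "status" -> "state" entry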
+  def status
+    return "FAILED" unless raw_response.is_a?(Net::HTTPSuccess)
+    body.dig("status", "state")
+  end
+  # ------------------- RESULTS --------------------
+  # Dig out the columns array from the response body.
+  # @return [Array<Hash>] column descriptors; each is read via its "name" and "position" keys
+  def columns
+    body.dig("manifest", "schema", "columns") || []
+  end
+  # Dig out values array for the queried data.
+  # Chunks have a simpler hash structure than initial SQL responses.
+  # @return [Array<Array>]
+  def extract_data_array
+    body.dig("result", "data_array") || body["data_array"] || []
+  end
+  # Return the results of the query as an array of hashes.
+  # @return [Array<Hash>]
+  def results
+    return [] if failed?
+    data_array.map do |row|
+      hash = {}
+      columns.each do |column|
+        hash[column["name"]] = row[column["position"]]
+      end
+      hash
+    end
+  end
+end

data/lib/dbx/gateway.rb CHANGED Viewed

@@ -6,11 +6,13 @@ require_relative "databricks/databricks"
 # This class is a gateway to the Databricks API.
 # https://docs.databricks.com/api-explorer/workspace/introduction
 class DatabricksGateway
-  def initialize(host: ENV.fetch("DBX_HOST", nil), token: ENV.fetch("DBX_TOKEN", nil), warehouse: ENV.fetch("DBX_WAREHOUSE_ID", nil))
+  def initialize(host: ENV.fetch("DBX_HOST", nil), token: ENV.fetch("DBX_TOKEN", nil),
+                 warehouse: ENV.fetch("DBX_WAREHOUSE_ID", nil), sleep_timer: 5)
     @base_url = host
     @uri = URI(@base_url)
     @token = token
     @warehouse = warehouse
+    @sleep_timer = sleep_timer
   end
   # HTTP request headers
@@ -23,6 +25,7 @@ class DatabricksGateway
   end
   # HTTP connection object
+  # @return [Net::HTTP]
   def http
     http = Net::HTTP.new(@uri.host, @uri.port)
     http.use_ssl = true

data/lib/dbx/version.rb CHANGED Viewed

@@ -1,5 +1,5 @@
 # frozen_string_literal: true
 module Dbx
-  VERSION = "0.1.1"
+  VERSION = "0.2.0"
 end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: dbx-api
 version: !ruby/object:Gem::Version
-  version: 0.1.1
+  version: 0.2.0
 platform: ruby
 authors:
 - cmmille
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-09-27 00:00:00.000000000 Z
+date: 2023-10-06 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: dotenv
@@ -45,6 +45,7 @@ files:
 - lib/dbx/databricks/databricks.rb
 - lib/dbx/databricks/jobs.rb
 - lib/dbx/databricks/sql.rb
+- lib/dbx/databricks/sql_response.rb
 - lib/dbx/gateway.rb
 - lib/dbx/version.rb
 - sig/dbx.rbs