data_drain 0.2.2 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,10 @@ module DataDrain
9
9
  :aws_access_key_id, :aws_secret_access_key,
10
10
  :db_host, :db_port, :db_user, :db_pass, :db_name,
11
11
  :batch_size, :throttle_delay, :logger, :limit_ram, :tmp_directory,
12
- :idle_in_transaction_session_timeout
12
+ :idle_in_transaction_session_timeout,
13
+ :vacuum_after_purge,
14
+ :slow_batch_threshold_s,
15
+ :slow_batch_alert_after
13
16
 
14
17
  def initialize
15
18
  @storage_mode = :local
@@ -20,6 +23,9 @@ module DataDrain
20
23
  @limit_ram = nil # eg 2GB
21
24
  @tmp_directory = nil # eg /tmp/duckdb_work
22
25
  @idle_in_transaction_session_timeout = 0
26
+ @vacuum_after_purge = false
27
+ @slow_batch_threshold_s = 30
28
+ @slow_batch_alert_after = 5
23
29
  @logger = Logger.new($stdout)
24
30
  end
25
31
 
@@ -5,12 +5,12 @@ require "pg"
5
5
 
6
6
  module DataDrain
7
7
  # Motor principal de extracción y purga de datos (DataDrain).
8
- # rubocop:disable Metrics/ClassLength, Metrics/AbcSize, Metrics/MethodLength, Naming/AccessorMethodName
9
8
  #
10
9
  # Orquesta el flujo ETL desde PostgreSQL hacia un Data Lake analítico
11
10
  # delegando la interacción del almacenamiento al adaptador configurado.
12
11
  class Engine
13
12
  include Observability
13
+ include Observability::Timing
14
14
  # Inicializa una nueva instancia del motor de extracción.
15
15
  #
16
16
  # @param options [Hash] Diccionario de configuración para la extracción.
@@ -50,70 +50,92 @@ module DataDrain
50
50
  @duckdb = database.connect
51
51
  end
52
52
 
53
- # Ejecuta el flujo completo del motor: Setup, Conteo, Exportación (opcional), Verificación y Purga.
54
- #
55
- # @return [Boolean] `true` si el proceso finalizó con éxito, `false` si falló la integridad.
53
+ # @return [Boolean] true si el flujo completó exitosamente, false si falló
56
54
  def call
57
- start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
58
- safe_log(:info, "engine.start",
59
- { table: @table_name, start_date: @start_date.to_date, end_date: @end_date.to_date })
55
+ @durations = {}
56
+ start_time = monotonic
57
+ log_start
60
58
 
61
59
  setup_duckdb
60
+ return skip_empty(start_time) if step_count.zero?
62
61
 
63
- # 1. Conteo inicial en Postgres
64
- step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
65
- @pg_count = get_postgres_count
66
- db_query_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
67
-
68
- if @pg_count.zero?
69
- duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
70
- safe_log(:info, "engine.skip_empty",
71
- { table: @table_name, duration_s: duration.round(2), db_query_duration_s: db_query_duration.round(2) })
72
- return true
73
- end
74
-
75
- # 2. Exportación
76
- export_duration = 0.0
77
62
  if @skip_export
78
63
  safe_log(:info, "engine.skip_export", { table: @table_name })
79
64
  else
80
- safe_log(:info, "engine.export_start", { table: @table_name, count: @pg_count })
81
- step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
82
- export_to_parquet
83
- export_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
65
+ step_export
84
66
  end
67
+ return integrity_failed(start_time) unless step_verify
85
68
 
86
- # 3. Verificación de Integridad
87
- step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
88
- integrity_ok = verify_integrity
89
- integrity_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
69
+ step_purge
70
+ log_complete(start_time)
71
+ true
72
+ end
90
73
 
91
- if integrity_ok
92
- # 4. Purga en Postgres
93
- step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
94
- purge_from_postgres
95
- purge_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
74
+ private
96
75
 
97
- duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
98
- safe_log(:info, "engine.complete", {
99
- table: @table_name,
100
- duration_s: duration.round(2),
101
- db_query_duration_s: db_query_duration.round(2),
102
- export_duration_s: export_duration.round(2),
103
- integrity_duration_s: integrity_duration.round(2),
104
- purge_duration_s: purge_duration.round(2),
105
- count: @pg_count
106
- })
107
- true
108
- else
109
- duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
110
- safe_log(:error, "engine.integrity_error",
111
- { table: @table_name, duration_s: duration.round(2), count: @pg_count })
112
- false
113
- end
76
+ # @api private
77
+ def log_start
78
+ safe_log(:info, "engine.start",
79
+ { table: @table_name, start_date: @start_date.to_date, end_date: @end_date.to_date })
114
80
  end
115
81
 
116
- private
82
+ # @api private
83
+ def step_count
84
+ @pg_count = timed(:db_query) { get_postgres_count }
85
+ @pg_count
86
+ end
87
+
88
+ # @api private
89
+ def skip_empty(start_time)
90
+ duration = monotonic - start_time
91
+ safe_log(:info, "engine.skip_empty", {
92
+ table: @table_name,
93
+ duration_s: duration.round(2),
94
+ db_query_duration_s: @durations.fetch(:db_query, 0).round(2)
95
+ })
96
+ true
97
+ end
98
+
99
+ # @api private
100
+ def step_export
101
+ safe_log(:info, "engine.export_start", { table: @table_name, count: @pg_count })
102
+ timed(:export) { export_to_parquet }
103
+ end
104
+
105
+ # @api private
106
+ def step_verify
107
+ timed(:integrity) { verify_integrity }
108
+ end
109
+
110
+ # @api private
111
+ def step_purge
112
+ timed(:purge) { purge_from_postgres }
113
+ end
114
+
115
+ # @api private
116
+ def log_complete(start_time)
117
+ duration = monotonic - start_time
118
+ safe_log(:info, "engine.complete", {
119
+ table: @table_name,
120
+ duration_s: duration.round(2),
121
+ db_query_duration_s: @durations.fetch(:db_query, 0).round(2),
122
+ export_duration_s: @durations.fetch(:export, 0).round(2),
123
+ integrity_duration_s: @durations.fetch(:integrity, 0).round(2),
124
+ purge_duration_s: @durations.fetch(:purge, 0).round(2),
125
+ count: @pg_count
126
+ })
127
+ end
128
+
129
+ # @api private
130
+ def integrity_failed(start_time)
131
+ duration = monotonic - start_time
132
+ safe_log(:error, "engine.integrity_error", {
133
+ table: @table_name,
134
+ duration_s: duration.round(2),
135
+ count: @pg_count
136
+ })
137
+ false
138
+ end
117
139
 
118
140
  # @api private
119
141
  # @return [String]
@@ -137,7 +159,7 @@ module DataDrain
137
159
  # @api private
138
160
  # @return [Integer]
139
161
  def get_postgres_count
140
- pg_sql = "SELECT COUNT(*) AS row_count FROM public.#{@table_name} WHERE #{base_where_sql}"
162
+ pg_sql = "SELECT count() AS row_count FROM public.#{@table_name} WHERE #{base_where_sql}"
141
163
  pg_sql = pg_sql.gsub("'", "''")
142
164
  query = "SELECT row_count FROM postgres_query('pg_source', '#{pg_sql}')"
143
165
  @duckdb.query(query).first.first
@@ -182,7 +204,7 @@ module DataDrain
182
204
 
183
205
  begin
184
206
  query = <<~SQL
185
- SELECT COUNT(*)
207
+ SELECT count()
186
208
  FROM read_parquet('#{archive_path}')
187
209
  WHERE #{base_where_sql}
188
210
  SQL
@@ -213,40 +235,129 @@ module DataDrain
213
235
  conn.exec("SET idle_in_transaction_session_timeout = #{@config.idle_in_transaction_session_timeout};")
214
236
  end
215
237
 
238
+ total_deleted = purge_loop(conn)
239
+
240
+ vacuum_if_needed(conn, total_deleted)
241
+ ensure
242
+ conn&.close
243
+ end
244
+
245
+ # @api private
246
+ def vacuum_if_needed(conn, total_deleted)
247
+ return unless @config.vacuum_after_purge
248
+ return if total_deleted.zero?
249
+
250
+ vacuum_start = monotonic
251
+ dead_before = fetch_dead_tuple_count(conn)
252
+
253
+ begin
254
+ conn.exec("VACUUM ANALYZE #{@table_name};")
255
+ rescue PG::Error => e
256
+ safe_log(:warn, "engine.vacuum_error", {
257
+ table: @table_name,
258
+ dead_tuples_before: dead_before,
259
+ rows_deleted_count: total_deleted,
260
+ duration_s: (monotonic - vacuum_start).round(2)
261
+ }.merge(exception_metadata(e)))
262
+ return
263
+ end
264
+
265
+ dead_after = fetch_dead_tuple_count(conn)
266
+ vacuum_duration = monotonic - vacuum_start
267
+
268
+ safe_log(:info, "engine.vacuum_complete", {
269
+ table: @table_name,
270
+ duration_s: vacuum_duration.round(2),
271
+ dead_tuples_before: dead_before,
272
+ dead_tuples_after: dead_after,
273
+ rows_deleted_count: total_deleted
274
+ })
275
+ end
276
+
277
+ # @api private
278
+ def fetch_dead_tuple_count(conn)
279
+ result = conn.exec_params(
280
+ "SELECT n_dead_tup FROM pg_stat_user_tables WHERE relname = $1",
281
+ [@table_name]
282
+ )
283
+ result.first&.dig("n_dead_tup")&.to_i || 0
284
+ rescue PG::Error
285
+ -1
286
+ end
287
+
288
+ # @api private
289
+ # @param conn [PG::Connection]
290
+ # @return [Integer] total de filas borradas
291
+ def purge_loop(conn)
216
292
  batches_processed = 0
217
293
  total_deleted = 0
294
+ slow_batch_streak = 0
218
295
 
219
296
  loop do
220
- sql = <<~SQL
221
- DELETE FROM #{@table_name}
222
- WHERE #{@primary_key} IN (
223
- SELECT #{@primary_key} FROM #{@table_name}
224
- WHERE #{base_where_sql}
225
- LIMIT #{@config.batch_size}
226
- )
227
- SQL
228
-
229
- result = conn.exec(sql)
297
+ batch_start = monotonic
298
+ result = conn.exec(build_delete_sql)
299
+ batch_duration = monotonic - batch_start
230
300
  count = result.cmd_tuples
231
301
  break if count.zero?
232
302
 
233
303
  batches_processed += 1
234
304
  total_deleted += count
235
305
 
236
- # Heartbeat cada 100 lotes para monitorear procesos largos de 1TB
237
- if (batches_processed % 100).zero?
238
- safe_log(:info, "engine.purge_heartbeat", {
306
+ slow_batch_streak = handle_batch_timing(batch_duration, count, slow_batch_streak)
307
+ emit_heartbeat_if_due(batches_processed, total_deleted)
308
+
309
+ sleep(@config.throttle_delay) if @config.throttle_delay.positive?
310
+ end
311
+
312
+ total_deleted
313
+ end
314
+
315
+ # @api private
316
+ def handle_batch_timing(batch_duration, count, streak)
317
+ if batch_duration > @config.slow_batch_threshold_s
318
+ streak += 1
319
+ safe_log(:warn, "engine.slow_batch", {
320
+ table: @table_name,
321
+ batch_duration_s: batch_duration.round(2),
322
+ batch_size: count,
323
+ streak: streak,
324
+ threshold_s: @config.slow_batch_threshold_s
325
+ })
326
+
327
+ if streak == @config.slow_batch_alert_after
328
+ safe_log(:warn, "engine.purge_degraded", {
239
329
  table: @table_name,
240
- batches_processed_count: batches_processed,
241
- rows_deleted_count: total_deleted
330
+ consecutive_slow_batches: streak,
331
+ hint: "considerar índice composite o particionamiento (ver postgres-tuning.md)"
242
332
  })
243
333
  end
244
-
245
- sleep(@config.throttle_delay) if @config.throttle_delay.positive?
334
+ streak
335
+ else
336
+ 0
246
337
  end
247
- ensure
248
- conn&.close
338
+ end
339
+
340
+ # @api private
341
+ def emit_heartbeat_if_due(batches_processed, total_deleted)
342
+ return unless (batches_processed % 100).zero?
343
+
344
+ safe_log(:info, "engine.purge_heartbeat", {
345
+ table: @table_name,
346
+ batches_processed_count: batches_processed,
347
+ rows_deleted_count: total_deleted
348
+ })
349
+ end
350
+
351
+ # @api private
352
+ def build_delete_sql
353
+ <<~SQL
354
+ DELETE FROM #{@table_name}
355
+ WHERE #{@primary_key} IN (
356
+ SELECT #{@primary_key} FROM #{@table_name}
357
+ WHERE #{base_where_sql}
358
+ LIMIT #{@config.batch_size}
359
+ )
360
+ SQL
249
361
  end
250
362
  end
251
- # rubocop:enable Metrics/ClassLength, Metrics/AbcSize, Metrics/MethodLength, Naming/AccessorMethodName
252
363
  end
@@ -6,8 +6,7 @@ module DataDrain
6
6
  # aplicando compresión ZSTD y particionamiento Hive.
7
7
  class FileIngestor
8
8
  include Observability
9
- # rubocop:disable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity,
10
- # Metrics/MethodLength
9
+ include Observability::Timing
11
10
 
12
11
  # @param options [Hash] Opciones de ingestión.
13
12
  # @option options [String] :source_path Ruta absoluta al archivo local.
@@ -36,46 +35,77 @@ module DataDrain
36
35
  # Ejecuta el flujo de ingestión.
37
36
  # @return [Boolean] true si el proceso fue exitoso.
38
37
  def call
39
- start_time = Process.clock_gettime(Process::CLOCK_MONOTONIC)
38
+ @durations = {}
39
+ start_time = monotonic
40
40
  safe_log(:info, "file_ingestor.start", { source_path: @source_path })
41
41
 
42
- unless File.exist?(@source_path)
43
- safe_log(:error, "file_ingestor.file_not_found", { source_path: @source_path })
44
- return false
45
- end
42
+ return file_not_found(start_time) unless step_validate_file
43
+
44
+ step_setup_duckdb
45
+ @reader_function = determine_reader
46
+ @source_count = step_count_source
47
+
48
+ return skip_empty(start_time) if @source_count.zero?
49
+
50
+ step_export
51
+ log_complete(start_time)
52
+ cleanup_local_file
53
+ true
54
+ rescue DuckDB::Error => e
55
+ duration = monotonic - start_time
56
+ safe_log(:error, "file_ingestor.duckdb_error",
57
+ { source_path: @source_path }.merge(exception_metadata(e)).merge(duration_s: duration.round(2)))
58
+ false
59
+ ensure
60
+ @duckdb&.close
61
+ end
62
+
63
+ private
64
+
65
+ # @api private
66
+ def file_not_found(_start_time)
67
+ safe_log(:error, "file_ingestor.file_not_found", { source_path: @source_path })
68
+ false
69
+ end
70
+
71
+ # @api private
72
+ def step_validate_file
73
+ File.exist?(@source_path)
74
+ end
46
75
 
76
+ # @api private
77
+ def step_setup_duckdb
47
78
  @duckdb.query("SET max_memory='#{@config.limit_ram}';") if @config.limit_ram.present?
48
79
  @duckdb.query("SET temp_directory='#{@config.tmp_directory}'") if @config.tmp_directory.present?
49
-
50
80
  @adapter.setup_duckdb(@duckdb)
81
+ end
51
82
 
52
- # Determinamos la función lectora de DuckDB según la extensión del archivo
53
- reader_function = determine_reader
54
-
55
- # 1. Conteo de seguridad
56
- step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
57
- source_count = @duckdb.query("SELECT COUNT(*) FROM #{reader_function}").first.first
58
- source_query_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
83
+ # @api private
84
+ def step_count_source
85
+ source_count = timed(:source_query) { @duckdb.query("SELECT count() FROM #{@reader_function}").first.first }
59
86
  safe_log(:info, "file_ingestor.count", {
60
87
  source_path: @source_path,
61
88
  count: source_count,
62
- source_query_duration_s: source_query_duration.round(2)
89
+ source_query_duration_s: @durations.fetch(:source_query, 0).round(2)
63
90
  })
91
+ source_count
92
+ end
64
93
 
65
- if source_count.zero?
66
- cleanup_local_file
67
- duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
68
- safe_log(:info, "file_ingestor.skip_empty", { source_path: @source_path, duration_s: duration.round(2) })
69
- return true
70
- end
94
+ # @api private
95
+ def skip_empty(start_time)
96
+ cleanup_local_file
97
+ duration = monotonic - start_time
98
+ safe_log(:info, "file_ingestor.skip_empty", { source_path: @source_path, duration_s: duration.round(2) })
99
+ true
100
+ end
71
101
 
72
- # 2. Exportación / Subida
102
+ # @api private
103
+ def step_export
73
104
  @adapter.prepare_export_path(@bucket, @folder_name)
74
105
  dest_path = if @config.storage_mode.to_sym == :s3
75
106
  "s3://#{@bucket}/#{@folder_name}/"
76
107
  else
77
- File.join(@bucket,
78
- @folder_name, "")
108
+ File.join(@bucket, @folder_name, "")
79
109
  end
80
110
 
81
111
  partition_clause = @partition_keys.any? ? "PARTITION_BY (#{@partition_keys.join(", ")})," : ""
@@ -83,7 +113,7 @@ module DataDrain
83
113
  query = <<~SQL
84
114
  COPY (
85
115
  SELECT #{@select_sql}
86
- FROM #{reader_function}
116
+ FROM #{@reader_function}
87
117
  ) TO '#{dest_path}'
88
118
  (
89
119
  FORMAT PARQUET,
@@ -94,32 +124,21 @@ module DataDrain
94
124
  SQL
95
125
 
96
126
  safe_log(:info, "file_ingestor.export_start", { dest_path: dest_path })
97
- step_start = Process.clock_gettime(Process::CLOCK_MONOTONIC)
98
- @duckdb.query(query)
99
- export_duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - step_start
127
+ timed(:export) { @duckdb.query(query) }
128
+ end
100
129
 
101
- duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
130
+ # @api private
131
+ def log_complete(start_time)
132
+ duration = monotonic - start_time
102
133
  safe_log(:info, "file_ingestor.complete", {
103
134
  source_path: @source_path,
104
135
  duration_s: duration.round(2),
105
- source_query_duration_s: source_query_duration.round(2),
106
- export_duration_s: export_duration.round(2),
107
- count: source_count
136
+ source_query_duration_s: @durations.fetch(:source_query, 0).round(2),
137
+ export_duration_s: @durations.fetch(:export, 0).round(2),
138
+ count: @source_count
108
139
  })
109
-
110
- cleanup_local_file
111
- true
112
- rescue DuckDB::Error => e
113
- duration = Process.clock_gettime(Process::CLOCK_MONOTONIC) - start_time
114
- safe_log(:error, "file_ingestor.duckdb_error",
115
- { source_path: @source_path }.merge(exception_metadata(e)).merge(duration_s: duration.round(2)))
116
- false
117
- ensure
118
- @duckdb&.close
119
140
  end
120
141
 
121
- private
122
-
123
142
  # @api private
124
143
  def determine_reader
125
144
  case File.extname(@source_path).downcase
@@ -142,6 +161,4 @@ module DataDrain
142
161
  safe_log(:info, "file_ingestor.cleanup", { source_path: @source_path })
143
162
  end
144
163
  end
145
- # rubocop:enable Metrics/AbcSize, Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity,
146
- # Metrics/MethodLength
147
164
  end
@@ -0,0 +1,23 @@
1
+ # frozen_string_literal: true
2
+
3
+ module DataDrain
4
+ module Observability
5
+ # Helper para medición de duración de operaciones.
6
+ # @api private
7
+ module Timing
8
+ private
9
+
10
+ def monotonic
11
+ Process.clock_gettime(Process::CLOCK_MONOTONIC)
12
+ end
13
+
14
+ def timed(step_name)
15
+ t = monotonic
16
+ result = yield
17
+ @durations ||= {}
18
+ @durations[step_name] = monotonic - t
19
+ result
20
+ end
21
+ end
22
+ end
23
+ end
@@ -7,6 +7,8 @@ module DataDrain
7
7
  # Este módulo es genérico y puede ser utilizado en otras gemas.
8
8
  # @api private
9
9
  module Observability
10
+ # Regex para detectar claves sensibles en logs y enmascararlas preventivamente.
11
+ # @!visibility private
10
12
  SENSITIVE_KEY_PATTERN = /password|passwd|pass|secret|token|api_key|apikey|auth|credential|private_key/i
11
13
 
12
14
  private
@@ -46,7 +46,6 @@ module DataDrain
46
46
  # Esto previene tener que recargar extensiones (como httpfs) en cada consulta.
47
47
  #
48
48
  # @return [DuckDB::Connection] Conexión activa a DuckDB.
49
- # rubocop:disable Metrics/AbcSize
50
49
  def self.connection
51
50
  Thread.current[:data_drain_duckdb] ||= begin
52
51
  db = DuckDB::Database.open(":memory:")
@@ -57,11 +56,13 @@ module DataDrain
57
56
  conn.query("SET temp_directory='#{config.tmp_directory}'") if config.tmp_directory.present?
58
57
 
59
58
  DataDrain::Storage.adapter.setup_duckdb(conn)
59
+
60
+ conn.query("SET lock_configuration=true;")
61
+
60
62
  { db: db, conn: conn }
61
63
  end
62
64
  Thread.current[:data_drain_duckdb][:conn]
63
65
  end
64
- # rubocop:enable Metrics/AbcSize
65
66
 
66
67
  # Consulta registros en el Data Lake filtrando por claves de partición.
67
68
  #
@@ -138,22 +139,14 @@ module DataDrain
138
139
  # @param sql [String]
139
140
  # @param columns [Array<String>]
140
141
  # @return [Array<DataDrain::Record>]
141
- # rubocop:disable Metrics/MethodLength
142
142
  def execute_and_instantiate(sql, columns)
143
143
  @logger = DataDrain.configuration.logger
144
- begin
145
- result = connection.query(sql)
146
- rescue DuckDB::Error => e
147
- safe_log(:warn, "record.parquet_not_found", exception_metadata(e))
148
- return []
149
- end
150
-
151
- result.map do |row|
152
- attributes_hash = columns.zip(row).to_h
153
- new(attributes_hash)
154
- end
144
+ result = connection.query(sql)
145
+ result.map { |row| new(columns.zip(row).to_h) }
146
+ rescue DuckDB::Error => e
147
+ safe_log(:warn, "record.parquet_not_found", exception_metadata(e))
148
+ []
155
149
  end
156
150
  end
157
- # rubocop:enable Metrics/MethodLength
158
151
  end
159
152
  end
@@ -54,6 +54,18 @@ module DataDrain
54
54
  def destroy_partitions(bucket, folder_name, partition_keys, partitions)
55
55
  raise NotImplementedError, "#{self.class} debe implementar #destroy_partitions"
56
56
  end
57
+
58
+ protected
59
+
60
+ # @param bucket [String]
61
+ # @param folder_name [String]
62
+ # @param partition_path [String, nil]
63
+ # @return [String] path sin prefix de protocolo ni sufijo glob
64
+ def build_path_base(bucket, folder_name, partition_path)
65
+ base = File.join(bucket, folder_name)
66
+ base = File.join(base, partition_path) if partition_path && !partition_path.empty?
67
+ base
68
+ end
57
69
  end
58
70
  end
59
71
  end
@@ -24,9 +24,7 @@ module DataDrain
24
24
  # @param partition_path [String, nil]
25
25
  # @return [String]
26
26
  def build_path(bucket, folder_name, partition_path)
27
- base = File.join(bucket, folder_name)
28
- base = File.join(base, partition_path) if partition_path && !partition_path.empty?
29
- "#{base}/**/*.parquet"
27
+ "#{build_path_base(bucket, folder_name, partition_path)}/**/*.parquet"
30
28
  end
31
29
 
32
30
  # @param bucket [String]