ductr 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +14 -0
  4. data/.vscode/settings.json +18 -0
  5. data/COPYING +674 -0
  6. data/COPYING.LESSER +165 -0
  7. data/Gemfile +6 -0
  8. data/Gemfile.lock +121 -0
  9. data/README.md +37 -0
  10. data/Rakefile +37 -0
  11. data/bin/console +15 -0
  12. data/bin/setup +8 -0
  13. data/ductr.gemspec +50 -0
  14. data/exe/ductr +24 -0
  15. data/lib/ductr/adapter.rb +94 -0
  16. data/lib/ductr/cli/default.rb +25 -0
  17. data/lib/ductr/cli/main.rb +60 -0
  18. data/lib/ductr/cli/new_project_generator.rb +72 -0
  19. data/lib/ductr/cli/templates/project/bin_ductr.rb +7 -0
  20. data/lib/ductr/cli/templates/project/config_app.rb +5 -0
  21. data/lib/ductr/cli/templates/project/config_development.yml +8 -0
  22. data/lib/ductr/cli/templates/project/config_environment_development.rb +18 -0
  23. data/lib/ductr/cli/templates/project/gemfile.rb +6 -0
  24. data/lib/ductr/cli/templates/project/rubocop.yml +14 -0
  25. data/lib/ductr/cli/templates/project/tool-versions +1 -0
  26. data/lib/ductr/configuration.rb +145 -0
  27. data/lib/ductr/etl/controls/buffered_destination.rb +65 -0
  28. data/lib/ductr/etl/controls/buffered_transform.rb +76 -0
  29. data/lib/ductr/etl/controls/control.rb +46 -0
  30. data/lib/ductr/etl/controls/destination.rb +28 -0
  31. data/lib/ductr/etl/controls/paginated_source.rb +47 -0
  32. data/lib/ductr/etl/controls/source.rb +21 -0
  33. data/lib/ductr/etl/controls/transform.rb +28 -0
  34. data/lib/ductr/etl/fiber_control.rb +136 -0
  35. data/lib/ductr/etl/fiber_runner.rb +68 -0
  36. data/lib/ductr/etl/kiba_runner.rb +26 -0
  37. data/lib/ductr/etl/parser.rb +115 -0
  38. data/lib/ductr/etl/runner.rb +37 -0
  39. data/lib/ductr/etl_job.rb +161 -0
  40. data/lib/ductr/job.rb +58 -0
  41. data/lib/ductr/job_etl_runner.rb +37 -0
  42. data/lib/ductr/job_status.rb +56 -0
  43. data/lib/ductr/kiba_job.rb +130 -0
  44. data/lib/ductr/log/formatters/color_formatter.rb +48 -0
  45. data/lib/ductr/log/logger.rb +169 -0
  46. data/lib/ductr/log/outputs/file_output.rb +30 -0
  47. data/lib/ductr/log/outputs/standard_output.rb +39 -0
  48. data/lib/ductr/pipeline.rb +133 -0
  49. data/lib/ductr/pipeline_runner.rb +95 -0
  50. data/lib/ductr/pipeline_step.rb +92 -0
  51. data/lib/ductr/registry.rb +55 -0
  52. data/lib/ductr/rufus_trigger.rb +106 -0
  53. data/lib/ductr/scheduler.rb +117 -0
  54. data/lib/ductr/store/job_serializer.rb +59 -0
  55. data/lib/ductr/store/job_store.rb +59 -0
  56. data/lib/ductr/store/pipeline_serializer.rb +106 -0
  57. data/lib/ductr/store/pipeline_store.rb +48 -0
  58. data/lib/ductr/store.rb +81 -0
  59. data/lib/ductr/trigger.rb +49 -0
  60. data/lib/ductr/version.rb +6 -0
  61. data/lib/ductr.rb +143 -0
  62. data/sig/ductr.rbs +1107 -0
  63. metadata +292 -0
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "thor"
4
+ require "thor/group"
5
+
6
+ module Ductr
7
+ module CLI
8
+ #
9
+ # Thor generator to create a new project
10
+ #
11
+ class NewProjectGenerator < Thor::Group
12
+ include Thor::Actions
13
+ desc "Generate a new project"
14
+ argument :name, type: :string, optional: true, default: ""
15
+
16
+ #
17
+ # The templates source used to create a new project
18
+ #
19
+ # @return [String] the templates source absolute path
20
+ #
21
+ def self.source_root
22
+ "#{__dir__}/templates/project"
23
+ end
24
+
25
+ #
26
+ # Performs some setup before generating files:
27
+ # creates the project directory and sets it as destination for the generator
28
+ #
29
+ # @return [void]
30
+ #
31
+ def init
32
+ empty_directory name
33
+ self.destination_root = "#{destination_root}/#{name}"
34
+ end
35
+
36
+ #
37
+ # Creates files in the project's root
38
+ #
39
+ # @return [void]
40
+ #
41
+ def gen_root
42
+ copy_file "gemfile.rb", "Gemfile"
43
+ copy_file "rubocop.yml", ".rubocop.yml"
44
+ copy_file "tool-versions", ".tool-versions"
45
+
46
+ create_file "app/jobs/.gitkeep"
47
+ create_file "app/pipelines/.gitkeep"
48
+ create_file "app/schedulers/.gitkeep"
49
+ end
50
+
51
+ #
52
+ # Creates the bin file for the project
53
+ #
54
+ # @return [void]
55
+ #
56
+ def gen_bin
57
+ copy_file "bin_ductr.rb", "bin/ductr"
58
+ end
59
+
60
+ #
61
+ # Creates files in the `config` folder
62
+ #
63
+ # @return [void]
64
+ #
65
+ def gen_config
66
+ copy_file "config_app.rb", "config/app.rb"
67
+ copy_file "config_development.yml", "config/development.yml"
68
+ copy_file "config_environment_development.rb", "config/environment/development.rb"
69
+ end
70
+ end
71
+ end
72
+ end
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "ductr"
5
+ require_relative "../config/app"
6
+
7
+ Ductr::CLI::Main.start
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ductr"
4
+
5
+ require_relative "environment/#{Ductr.env}"
@@ -0,0 +1,8 @@
1
+ # Add your vars here, you can make erb bindings e.g.:
2
+ # example: <%= ENV.fetch("SOME_VAR", "some default value") %>
3
+
4
+ # declare your adapters here
5
+ adapters:
6
+ # my_adapter:
7
+ # adapter: sqlite
8
+ # database: some.db
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+ require "active_support/cache/file_store"
3
+
4
+ Ductr.configure do |config|
5
+ # Store configuration.
6
+ #
7
+ # You can pass an `ActiveSupport::Cache::Store` class or a symbol
8
+ config.store(ActiveSupport::Cache::FileStore, "tmp/store")
9
+
10
+ # Logging configuration.
11
+ #
12
+ # The following logging levels are available:
13
+ # :debug, :info, :warn, :error, :fatal
14
+ config.logging.level = :debug
15
+
16
+ # Append logs to the stdout by default, you can add/replace outputs at will.
17
+ config.logging.add_output(Ductr::Log::StandardOutput, Ductr::Log::ColorFormatter)
18
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gem "annotable", git: "git@gitlab.com:la-manufacture/rocket/annotable.git", branch: "master"
6
+ gem "ductr", git: "git@gitlab.com:la-manufacture/rocket/ductr.git", branch: "main"
@@ -0,0 +1,14 @@
1
+ AllCops:
2
+ TargetRubyVersion: 3.1
3
+ NewCops: enable
4
+
5
+ Style/StringLiterals:
6
+ Enabled: true
7
+ EnforcedStyle: double_quotes
8
+
9
+ Style/StringLiteralsInInterpolation:
10
+ Enabled: true
11
+ EnforcedStyle: double_quotes
12
+
13
+ Layout/LineLength:
14
+ Max: 120
@@ -0,0 +1 @@
1
+ ruby 3.1.0
@@ -0,0 +1,145 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+ require "erb"
5
+
6
+ module Ductr
7
+ #
8
+ # Contains the framework's configuration, including:
9
+ # - `DUCTR_ENV` environment variable
10
+ # - `Ductr.configure` block
11
+ # - Project root path
12
+ # - YML file configuration
13
+ #
14
+ class Configuration
15
+ # @return [Struct] The active job configuration, available options are
16
+ # `queue_adapter`, `default_queue_name`, `queue_name_prefix` & `queue_name_delimiter`
17
+ attr_reader :active_job
18
+
19
+ # @return [Class<Ductr::Log::Logger>] The logger constant
20
+ attr_reader :logging
21
+
22
+ # @return [String] The project root
23
+ attr_reader :root
24
+
25
+ # @return [Class<ActiveSupport::Cache::Store>, Symbol] The store adapter to use
26
+ # @see https://edgeapi.rubyonrails.org/classes/ActiveSupport/Cache.html#method-c-lookup_store
27
+ attr_reader :store_adapter
28
+
29
+ # @return [Array] The store adapter config
30
+ attr_reader :store_parameters
31
+
32
+ # @return [Hash] The parsed YML configuration
33
+ attr_reader :yml
34
+
35
+ #
36
+ # Sets the project root to the current directory, loads the YAML configuration of the given environment
37
+ # and exposes the logger class and the default store settings so they can be changed through the `Ductr.configure` block
38
+ #
39
+ def initialize(env)
40
+ @root = Dir.pwd
41
+ @yml = load_yaml("#{root}/config/#{env}.yml")
42
+
43
+ @logging = Log::Logger
44
+ logging.level = :debug
45
+
46
+ @active_job = Struct.new(:queue_adapter, :default_queue_name, :queue_name_prefix, :queue_name_delimiter).new
47
+ @store_adapter = ActiveSupport::Cache::FileStore
48
+ @store_parameters = ["tmp/store"]
49
+ end
50
+
51
+ #
52
+ # Configures the store instance.
53
+ #
54
+ # @param [Class<ActiveSupport::Cache::Store>, Symbol] adapter The store adapter class
55
+ # @param [Array] *parameters The store adapter configuration
56
+ #
57
+ # @return [void]
58
+ #
59
+ def store(adapter, *parameters)
60
+ @store_adapter = adapter
61
+ @store_parameters = parameters
62
+ end
63
+
64
+ #
65
+ # Memoize configured adapters based on the YAML configuration.
66
+ #
67
+ # @return [Array<Adapter>] The configured Adapter instances
68
+ #
69
def adapters
  # `yml` is a plain Hash when the configuration file is missing, and the parsed
  # struct has no `adapters` member (or holds nil) when the file declares none.
  # In all those cases there is nothing to instantiate, so an empty list is memoized.
  @adapters ||= begin
    entries = yml.respond_to?(:adapters) ? yml.adapters.to_h : {}

    entries.map do |name, entry|
      adapter_class = Ductr.adapter_registry.find(entry.adapter)
      # Everything but the adapter type is forwarded as the adapter's own configuration.
      config = entry.to_h.except(:adapter)

      adapter_class.new(name, **config)
    end
  end
end
77
+
78
+ #
79
+ # Find an adapter based on its name.
80
+ #
81
+ # @param [Symbol] name The name of the adapter to find
82
+ #
83
+ # @raise [AdapterNotFoundError] If no adapter matches the given name
84
+ # @return [Adapter] The adapter found
85
+ #
86
+ def adapter(name)
87
+ not_found_error = -> { raise AdapterNotFoundError, "The adapter named \"#{name}\" does not exist" }
88
+
89
+ adapters.find(not_found_error) do |adapter|
90
+ adapter.name == name
91
+ end
92
+ end
93
+
94
+ #
95
+ # Configures active job with the given options.
96
+ #
97
+ # @return [void]
98
+ #
99
+ def apply_active_job_config
100
+ ActiveJob::Base.logger = logging.new("ActiveJob")
101
+
102
+ active_job.each_pair do |opt, value|
103
+ next unless value
104
+
105
+ ActiveJob::Base.send("#{opt}=", value)
106
+ end
107
+ end
108
+
109
+ private
110
+
111
+ #
112
+ # Loads the YAML configuration located at the given path.
113
+ # Parse the file with ERB before parsing YAML, so we can use env vars in config files.
114
+ #
115
+ # @param [String] path The path of the YAML file to load
116
+ #
117
+ # @return [Struct] The parsed YAML configuration
118
+ #
119
def load_yaml(path)
  # A missing file is not an error: the project simply has no YAML configuration.
  return {} unless path && File.exist?(path)

  # ERB is rendered first so the file can embed bindings such as environment variables.
  rendered = ERB.new(File.read(path)).result
  yaml = YAML.load(rendered, symbolize_names: true)

  # An empty or comment-only file does not parse to a Hash, treat it like a missing file.
  return {} unless yaml.is_a?(Hash)

  hash_to_struct(yaml)
end

#
# Recursively converts a Hash into a Struct, nested hashes included.
# Empty hashes are returned untouched since a Struct needs at least one member.
#
# @param [Hash] hash The hash to convert
#
# @return [Struct, Hash] The converted hash, or the given hash when it is empty
#
def hash_to_struct(hash)
  return hash if hash.empty?

  values = hash.values.map do |value|
    value.is_a?(Hash) ? hash_to_struct(value) : value
  end

  Struct.new(*hash.keys).new(*values)
end
144
+ end
145
+ end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module ETL
5
+ #
6
+ # Base class to implement buffered destinations.
7
+ #
8
+ class BufferedDestination < Destination
9
+ # @return [Array] The row buffer
10
+ attr_reader :buffer
11
+
12
+ #
13
+ # The buffer size option, defaults to 10_000.
14
+ #
15
+ # @return [Integer] The buffer size
16
+ #
17
+ def buffer_size
18
+ @options[:buffer_size] || 10_000
19
+ end
20
+
21
+ #
22
+ # Pushes the row inside the buffer or flushes it when full.
23
+ #
24
+ # @param [Object] row The row to write
25
+ #
26
+ # @return [void]
27
+ #
28
+ def write(row)
29
+ @buffer ||= []
30
+
31
+ @buffer.push row
32
+ flush_buffer if @buffer.size == buffer_size
33
+ end
34
+
35
+ #
36
+ # Flushes the buffer, called when the last row is reached.
37
+ #
38
+ # @return [void]
39
+ #
40
def close
  # The buffer only exists once #write has been called, so a destination that
  # received no row at all has nothing to flush.
  flush_buffer if @buffer && !@buffer.empty?
  super
end
44
+
45
+ #
46
+ # Calls #on_flush and resets the buffer.
47
+ #
48
+ # @return [void]
49
+ #
50
+ def flush_buffer
51
+ on_flush
52
+ @buffer = []
53
+ end
54
+
55
+ #
56
+ # Called each time the buffer has to be emptied.
57
+ #
58
+ # @return [void]
59
+ #
60
+ def on_flush
61
+ raise NotImplementedError, "A buffered destination must implement the `#on_flush` method"
62
+ end
63
+ end
64
+ end
65
+ end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module ETL
5
+ #
6
+ # Base class to implement buffered transforms.
7
+ #
8
+ class BufferedTransform < Transform
9
+ # @return [Array] The row buffer
10
+ attr_reader :buffer
11
+
12
+ #
13
+ # The buffer size option, defaults to 10_000.
14
+ #
15
+ # @return [Integer] The buffer size
16
+ #
17
+ def buffer_size
18
+ @options[:buffer_size] || 10_000
19
+ end
20
+
21
+ #
22
+ # Pushes the row inside the buffer or flushes it when full.
23
+ #
24
+ # @param [Object] row The row to process
25
+ # @yield [row] The row yielder
26
+ #
27
+ # @return [nil] Returns nil to comply with kiba
28
+ #
29
+ def process(row, &)
30
+ @buffer ||= []
31
+
32
+ @buffer.push row
33
+ flush_buffer(&) if @buffer.size == buffer_size
34
+
35
+ # avoid returning a row, see
36
+ # https://github.com/thbar/kiba/wiki/Implementing-ETL-transforms#generating-more-than-one-output-row-per-input-row-aka-yielding-transforms
37
+ nil
38
+ end
39
+
40
+ #
41
+ # Called when the last row is reached.
42
+ #
43
+ # @yield [row] The row yielder
44
+ #
45
+ # @return [void]
46
+ #
47
def close(&)
  # The buffer only exists once #process has been called, so a transform that
  # received no row at all has nothing to flush.
  flush_buffer(&) if @buffer && !@buffer.empty?
  super
end
51
+
52
+ #
53
+ # Calls #on_flush and resets the buffer.
54
+ #
55
+ # @yield [row] The row yielder
56
+ #
57
+ # @return [void]
58
+ #
59
+ def flush_buffer(&)
60
+ on_flush(&)
61
+ @buffer = []
62
+ end
63
+
64
+ #
65
+ # Called each time the buffer has to be emptied.
66
+ #
67
+ # @yield [row] The row yielder
68
+ #
69
+ # @return [void]
70
+ #
71
+ def on_flush(&)
72
+ raise NotImplementedError, "A buffered transform must implement the `#on_flush` method"
73
+ end
74
+ end
75
+ end
76
+ end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module ETL
5
+ #
6
+ # Base class for all types of ETL control.
7
+ #
8
+ class Control
9
+ extend Forwardable
10
+
11
+ class << self
12
+ # @return [Symbol] The control type, written when registering the control into its adapter
13
+ attr_accessor :type
14
+ end
15
+
16
+ #
17
+ # @!method call_method
18
+ # Invokes the job's method linked to the control.
19
+ # @return [Object] Something returned by the method, e.g. a query, a file, a row, ...
20
+ #
21
+ def_delegator :@job_method, :call, :call_method
22
+
23
+ # @return [Method] The method to be called by the control
24
+ attr_reader :job_method
25
+
26
+ # @return [Hash] The configuration hash of the control's adapter
27
+ attr_reader :options
28
+
29
+ # @return [Adapter] The control's adapter
30
+ attr_reader :adapter
31
+
32
+ #
33
+ # Creates a new control based on the job instance and the configured adapter.
34
+ #
35
+ # @param [Method] job_method The job's method to be called by the control
36
+ # @param [Adapter] adapter The configured adapter
37
+ # @param [Hash] **options The configuration hash of the control's adapter
38
+ #
39
+ def initialize(job_method, adapter = nil, **options)
40
+ @job_method = job_method
41
+ @adapter = adapter
42
+ @options = options
43
+ end
44
+ end
45
+ end
46
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module ETL
5
+ #
6
+ # Base class for implementing destinations.
7
+ #
8
+ class Destination < Control
9
+ #
10
+ # Writes the row into the destination.
11
+ #
12
+ # @param [Object] row The row to write
13
+ #
14
+ # @return [void]
15
+ #
16
+ def write(row)
17
+ call_method(row)
18
+ end
19
+
20
+ #
21
+ # Called when the last row is reached, closes the adapter.
22
+ #
23
+ # @return [void]
24
+ #
25
+ def close; end
26
+ end
27
+ end
28
+ end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module ETL
5
+ #
6
+ # Base class to implement paginated source.
7
+ #
8
+ class PaginatedSource < Source
9
+ #
10
+ # The page size option, defaults to 10_000.
11
+ #
12
+ # @return [Integer] The page size
13
+ #
14
+ def page_size
15
+ @options[:page_size] || 10_000
16
+ end
17
+
18
+ #
19
+ # Iterates over pages and calls #each_page.
20
+ #
21
+ # @yield [row] The row yielder
22
+ #
23
+ # @return [void]
24
+ #
25
+ def each(&)
26
+ @offset ||= 0
27
+
28
+ loop do
29
+ break unless each_page(&)
30
+
31
+ @offset += page_size
32
+ end
33
+ end
34
+
35
+ #
36
+ # Called once per page.
37
+ #
38
+ # @yield [row] The row yielder
39
+ #
40
+ # @return [Boolean] A truthy value to move on to the next page, a falsy one to stop iterating
41
+ #
42
+ def each_page(&)
43
+ raise NotImplementedError, "A paginated source must implement the `#each_page` method"
44
+ end
45
+ end
46
+ end
47
+ end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module ETL
5
+ #
6
+ # The base class for implementing sources.
7
+ #
8
+ class Source < Control
9
+ #
10
+ # Iterates over rows.
11
+ #
12
+ # @yield [row] The row yielder
13
+ #
14
+ # @return [void]
15
+ #
16
+ def each(&)
17
+ call_method.each(&)
18
+ end
19
+ end
20
+ end
21
+ end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Ductr
4
+ module ETL
5
+ #
6
+ # Base class for implementing transforms.
7
+ #
8
+ class Transform < Control
9
+ #
10
+ # Calls the control method and passes the row.
11
+ #
12
+ # @param [Object] row The row to process
13
+ #
14
+ # @return [void]
15
+ #
16
+ def process(row)
17
+ call_method(row)
18
+ end
19
+
20
+ #
21
+ # Called when the last row is reached.
22
+ #
23
+ # @return [void]
24
+ #
25
+ def close; end
26
+ end
27
+ end
28
+ end