ductr 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
Files changed (63) hide show
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.rubocop.yml +14 -0
  4. data/.vscode/settings.json +18 -0
  5. data/COPYING +674 -0
  6. data/COPYING.LESSER +165 -0
  7. data/Gemfile +6 -0
  8. data/Gemfile.lock +121 -0
  9. data/README.md +37 -0
  10. data/Rakefile +37 -0
  11. data/bin/console +15 -0
  12. data/bin/setup +8 -0
  13. data/ductr.gemspec +50 -0
  14. data/exe/ductr +24 -0
  15. data/lib/ductr/adapter.rb +94 -0
  16. data/lib/ductr/cli/default.rb +25 -0
  17. data/lib/ductr/cli/main.rb +60 -0
  18. data/lib/ductr/cli/new_project_generator.rb +72 -0
  19. data/lib/ductr/cli/templates/project/bin_ductr.rb +7 -0
  20. data/lib/ductr/cli/templates/project/config_app.rb +5 -0
  21. data/lib/ductr/cli/templates/project/config_development.yml +8 -0
  22. data/lib/ductr/cli/templates/project/config_environment_development.rb +18 -0
  23. data/lib/ductr/cli/templates/project/gemfile.rb +6 -0
  24. data/lib/ductr/cli/templates/project/rubocop.yml +14 -0
  25. data/lib/ductr/cli/templates/project/tool-versions +1 -0
  26. data/lib/ductr/configuration.rb +145 -0
  27. data/lib/ductr/etl/controls/buffered_destination.rb +65 -0
  28. data/lib/ductr/etl/controls/buffered_transform.rb +76 -0
  29. data/lib/ductr/etl/controls/control.rb +46 -0
  30. data/lib/ductr/etl/controls/destination.rb +28 -0
  31. data/lib/ductr/etl/controls/paginated_source.rb +47 -0
  32. data/lib/ductr/etl/controls/source.rb +21 -0
  33. data/lib/ductr/etl/controls/transform.rb +28 -0
  34. data/lib/ductr/etl/fiber_control.rb +136 -0
  35. data/lib/ductr/etl/fiber_runner.rb +68 -0
  36. data/lib/ductr/etl/kiba_runner.rb +26 -0
  37. data/lib/ductr/etl/parser.rb +115 -0
  38. data/lib/ductr/etl/runner.rb +37 -0
  39. data/lib/ductr/etl_job.rb +161 -0
  40. data/lib/ductr/job.rb +58 -0
  41. data/lib/ductr/job_etl_runner.rb +37 -0
  42. data/lib/ductr/job_status.rb +56 -0
  43. data/lib/ductr/kiba_job.rb +130 -0
  44. data/lib/ductr/log/formatters/color_formatter.rb +48 -0
  45. data/lib/ductr/log/logger.rb +169 -0
  46. data/lib/ductr/log/outputs/file_output.rb +30 -0
  47. data/lib/ductr/log/outputs/standard_output.rb +39 -0
  48. data/lib/ductr/pipeline.rb +133 -0
  49. data/lib/ductr/pipeline_runner.rb +95 -0
  50. data/lib/ductr/pipeline_step.rb +92 -0
  51. data/lib/ductr/registry.rb +55 -0
  52. data/lib/ductr/rufus_trigger.rb +106 -0
  53. data/lib/ductr/scheduler.rb +117 -0
  54. data/lib/ductr/store/job_serializer.rb +59 -0
  55. data/lib/ductr/store/job_store.rb +59 -0
  56. data/lib/ductr/store/pipeline_serializer.rb +106 -0
  57. data/lib/ductr/store/pipeline_store.rb +48 -0
  58. data/lib/ductr/store.rb +81 -0
  59. data/lib/ductr/trigger.rb +49 -0
  60. data/lib/ductr/version.rb +6 -0
  61. data/lib/ductr.rb +143 -0
  62. data/sig/ductr.rbs +1107 -0
  63. metadata +292 -0
@@ -0,0 +1,72 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "thor"
4
+ require "thor/group"
5
+
6
module Ductr
  module CLI
    #
    # Thor generator to create a new project.
    #
    # Every public instance method below is a generation step, so no public helper should be added here.
    #
    class NewProjectGenerator < Thor::Group
      include Thor::Actions
      desc "Generate a new project"
      argument :name, type: :string, optional: true, default: ""

      #
      # The templates source used to create a new project
      #
      # @return [String] the templates source absolute path
      #
      def self.source_root
        "#{__dir__}/templates/project"
      end

      #
      # Does some setup before generating files:
      # creates the project directory and sets it as the destination for the generator.
      #
      # @return [void]
      #
      def init
        empty_directory name
        self.destination_root = "#{destination_root}/#{name}"
      end

      #
      # Creates files in the project's root and the empty `app` sub-folders
      #
      # @return [void]
      #
      def gen_root
        copy_file "gemfile.rb", "Gemfile"
        copy_file "rubocop.yml", ".rubocop.yml"
        copy_file "tool-versions", ".tool-versions"

        # `.gitkeep` files keep the otherwise empty folders under version control
        create_file "app/jobs/.gitkeep"
        create_file "app/pipelines/.gitkeep"
        create_file "app/schedulers/.gitkeep"
      end

      #
      # Creates the bin file for the project and makes it executable
      #
      # @return [void]
      #
      def gen_bin
        copy_file "bin_ductr.rb", "bin/ductr"
        # The template starts with a shebang, the generated script is meant to be run directly
        chmod "bin/ductr", 0o755
      end

      #
      # Creates files in the `config` folder
      #
      # @return [void]
      #
      def gen_config
        copy_file "config_app.rb", "config/app.rb"
        copy_file "config_development.yml", "config/development.yml"
        copy_file "config_environment_development.rb", "config/environment/development.rb"
      end
    end
  end
end
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "ductr"
5
+ require_relative "../config/app"
6
+
7
+ Ductr::CLI::Main.start
@@ -0,0 +1,5 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "ductr"
4
+
5
+ require_relative "environment/#{Ductr.env}"
@@ -0,0 +1,8 @@
1
+ # Add your variables here. ERB tags can be used in this file, e.g.:
2
+ # example: <%= ENV.fetch("SOME_VAR", "some default value") %>
3
+
4
+ # declare your adapters here
5
+ adapters:
6
+ # my_adapter:
7
+ # adapter: sqlite
8
+ # database: some.db
@@ -0,0 +1,18 @@
1
+ # frozen_string_literal: true
2
+ require "active_support/cache/file_store"
3
+
4
+ Ductr.configure do |config|
5
+ # Store configuration.
6
+ #
7
+ # You can pass an `ActiveSupport::Cache::Store` class or a symbol
8
+ config.store(ActiveSupport::Cache::FileStore, "tmp/store")
9
+
10
+ # Logging configuration.
11
+ #
12
+ # The following logging levels are available:
13
+ # :debug, :info, :warn, :error, :fatal
14
+ config.logging.level = :debug
15
+
16
+ # Logs are appended to stdout by default; you can add or replace outputs at will.
17
+ config.logging.add_output(Ductr::Log::StandardOutput, Ductr::Log::ColorFormatter)
18
+ end
@@ -0,0 +1,6 @@
1
+ # frozen_string_literal: true
2
+
3
+ source "https://rubygems.org"
4
+
5
+ gem "annotable", git: "git@gitlab.com:la-manufacture/rocket/annotable.git", branch: "master"
6
+ gem "ductr", git: "git@gitlab.com:la-manufacture/rocket/ductr.git", branch: "main"
@@ -0,0 +1,14 @@
1
+ AllCops:
2
+ TargetRubyVersion: 3.1
3
+ NewCops: enable
4
+
5
+ Style/StringLiterals:
6
+ Enabled: true
7
+ EnforcedStyle: double_quotes
8
+
9
+ Style/StringLiteralsInInterpolation:
10
+ Enabled: true
11
+ EnforcedStyle: double_quotes
12
+
13
+ Layout/LineLength:
14
+ Max: 120
@@ -0,0 +1 @@
1
+ ruby 3.1.0
@@ -0,0 +1,145 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "yaml"
4
+ require "erb"
5
+
6
module Ductr
  #
  # Contains the framework's configuration, including:
  # - The settings made through the `Ductr.configure` block (store, logging, active job)
  # - Project root path
  # - YML file configuration of the environment given at initialization
  #
  class Configuration
    # @return [Struct] The active job configuration, available options are
    #   `queue_adapter`, `default_queue_name`, `queue_name_prefix` & `queue_name_delimiter`
    attr_reader :active_job

    # @return [Class<Ductr::Log::Logger>] The logger constant
    attr_reader :logging

    # @return [String] The project root
    attr_reader :root

    # @return [Class<ActiveSupport::Cache::Store>, Symbol] The store adapter to use
    # @see https://edgeapi.rubyonrails.org/classes/ActiveSupport/Cache.html#method-c-lookup_store
    attr_reader :store_adapter

    # @return [Array] The store adapter config
    attr_reader :store_parameters

    # @return [Struct, Hash] The parsed YML configuration, an empty hash when there is nothing to load
    attr_reader :yml

    #
    # Sets the project root to the current working directory, loads `config/<env>.yml` from it
    # and applies the defaults: debug log level, empty active job options and a file store in `tmp/store`.
    #
    # @param [String, Symbol] env The environment name, used to pick the YML file to load
    #
    def initialize(env)
      @root = Dir.pwd
      @yml = load_yaml("#{root}/config/#{env}.yml")

      @logging = Log::Logger
      logging.level = :debug

      @active_job = Struct.new(:queue_adapter, :default_queue_name, :queue_name_prefix, :queue_name_delimiter).new
      @store_adapter = ActiveSupport::Cache::FileStore
      @store_parameters = ["tmp/store"]
    end

    #
    # Configures the store instance.
    #
    # @param [Class<ActiveSupport::Cache::Store>, Symbol] adapter The store adapter class
    # @param [Array] *parameters The store adapter configuration
    #
    # @return [void]
    #
    def store(adapter, *parameters)
      @store_adapter = adapter
      @store_parameters = parameters
    end

    #
    # Memoizes configured adapters based on the YAML configuration.
    # Returns an empty array when the configuration declares no adapter at all.
    #
    # @return [Array<Adapter>] The configured Adapter instances
    #
    def adapters
      @adapters ||= declared_adapters.map do |name, entry|
        adapter_class = Ductr.adapter_registry.find(entry.adapter)
        config = entry.to_h.except(:adapter)

        adapter_class.new(name, **config)
      end
    end

    #
    # Finds an adapter based on its name.
    #
    # @param [Symbol] name The name of the adapter to find
    #
    # @raise [AdapterNotFoundError] If no adapter matches the given name
    # @return [Adapter] The adapter found
    #
    def adapter(name)
      # `Enumerable#find` calls this lambda only when nothing matched
      not_found_error = -> { raise AdapterNotFoundError, "The adapter named \"#{name}\" does not exist" }

      adapters.find(not_found_error) do |adapter|
        adapter.name == name
      end
    end

    #
    # Configures active job with the given options, options left to nil are not applied.
    #
    # @return [void]
    #
    def apply_active_job_config
      ActiveJob::Base.logger = logging.new("ActiveJob")

      active_job.each_pair do |opt, value|
        next unless value

        ActiveJob::Base.public_send("#{opt}=", value)
      end
    end

    private

    #
    # The `adapters` section of the YAML configuration as a hash.
    # The configuration is an empty hash when no file was loaded and the struct has no `adapters`
    # member when the key is absent from the file, both cases mean "no adapter".
    #
    # @return [Hash] The adapter entries by name
    #
    def declared_adapters
      return {} unless yml.respond_to?(:adapters)

      yml.adapters.to_h
    end

    #
    # Loads the YAML configuration located at the given path.
    # Parses the file with ERB before parsing YAML, so we can use env vars in config files.
    #
    # @param [String] path The path of the YAML file to load
    #
    # @return [Struct, Hash] The parsed YAML configuration, an empty hash if the file is missing or empty
    #
    def load_yaml(path)
      return {} unless path && File.exist?(path)

      erb = ERB.new(File.read(path))
      yaml = YAML.load(erb.result, symbolize_names: true)
      # An empty or comment-only file does not parse to a Hash, treat it like a missing file
      return {} unless yaml.is_a?(Hash)

      hash_to_struct(yaml)
    end

    #
    # Recursively converts a Hash into a Struct.
    #
    # @param [Hash] hash The hash to convert
    #
    # @return [Struct, Hash] The converted hash, empty hashes are returned untouched
    #
    def hash_to_struct(hash)
      # A struct needs at least one member on Rubies where `Struct.new` rejects an empty member list
      return hash if hash.empty?

      values = hash.values.map do |value|
        value.is_a?(Hash) ? hash_to_struct(value) : value
      end

      Struct.new(*hash.keys).new(*values)
    end
  end
end
@@ -0,0 +1,65 @@
1
+ # frozen_string_literal: true
2
+
3
module Ductr
  module ETL
    #
    # Base class to implement buffered destinations.
    # Rows are accumulated in a buffer, subclasses write them in batch by implementing `#on_flush`.
    #
    class BufferedDestination < Destination
      #
      # The row buffer, lazily created so it can be read safely before the first row is written.
      #
      # @return [Array] The row buffer
      #
      def buffer
        @buffer ||= []
      end

      #
      # The buffer size option, default to 10_000.
      #
      # @return [Integer] The buffer size
      #
      def buffer_size
        @options[:buffer_size] || 10_000
      end

      #
      # Pushes the row inside the buffer and flushes it when full.
      #
      # @param [Object] row The row to write
      #
      # @return [void]
      #
      def write(row)
        buffer.push(row)
        # `>=` instead of `==` so the buffer is still flushed if it ever grows past the configured size
        flush_buffer if buffer.size >= buffer_size
      end

      #
      # Flushes the remaining rows, called when the last row is reached.
      # Does not flush when no row is pending, including when no row was written at all.
      #
      # @return [void]
      #
      def close
        flush_buffer unless buffer.empty?
        super
      end

      #
      # Calls #on_flush and resets the buffer.
      #
      # @return [void]
      #
      def flush_buffer
        on_flush
        @buffer = []
      end

      #
      # Called each time the buffer has to be emptied.
      #
      # @raise [NotImplementedError] Unless overridden by the subclass
      # @return [void]
      #
      def on_flush
        raise NotImplementedError, "A buffered destination must implement the `#on_flush` method"
      end
    end
  end
end
@@ -0,0 +1,76 @@
1
+ # frozen_string_literal: true
2
+
3
module Ductr
  module ETL
    #
    # Base class to implement buffered transforms.
    # Rows are accumulated in a buffer, subclasses process them in batch by implementing `#on_flush`.
    #
    class BufferedTransform < Transform
      #
      # The row buffer, lazily created so it can be read safely before the first row is processed.
      #
      # @return [Array] The row buffer
      #
      def buffer
        @buffer ||= []
      end

      #
      # The buffer size option, default to 10_000.
      #
      # @return [Integer] The buffer size
      #
      def buffer_size
        @options[:buffer_size] || 10_000
      end

      #
      # Pushes the row inside the buffer and flushes it when full.
      #
      # @param [Object] row The row to process
      # @yield [row] The row yielder
      #
      # @return [nil] Returning nil to comply with kiba
      #
      def process(row, &)
        buffer.push(row)
        # `>=` instead of `==` so the buffer is still flushed if it ever grows past the configured size
        flush_buffer(&) if buffer.size >= buffer_size

        # avoid returning a row, see
        # https://github.com/thbar/kiba/wiki/Implementing-ETL-transforms#generating-more-than-one-output-row-per-input-row-aka-yielding-transforms
        nil
      end

      #
      # Flushes the remaining rows, called when the last row is reached.
      # Does not flush when no row is pending, including when no row was processed at all.
      #
      # @yield [row] The row yielder
      #
      # @return [void]
      #
      def close(&)
        flush_buffer(&) unless buffer.empty?
        super
      end

      #
      # Calls #on_flush and resets the buffer.
      #
      # @yield [row] The row yielder
      #
      # @return [void]
      #
      def flush_buffer(&)
        on_flush(&)
        @buffer = []
      end

      #
      # Called each time the buffer has to be emptied.
      #
      # @yield [row] The row yielder
      #
      # @raise [NotImplementedError] Unless overridden by the subclass
      # @return [void]
      #
      def on_flush(&)
        raise NotImplementedError, "A buffered transform must implement the `#on_flush` method"
      end
    end
  end
end
@@ -0,0 +1,46 @@
1
+ # frozen_string_literal: true
2
+
3
# Forwardable is part of the standard library but is not loaded by default
require "forwardable"

module Ductr
  module ETL
    #
    # Base class for all types of ETL control.
    #
    class Control
      extend Forwardable

      class << self
        # @return [Symbol] The control type, written when registering the control into its adapter
        attr_accessor :type
      end

      #
      # @!method call_method
      #   Invokes the job's method linked to the control, arguments are forwarded to it.
      #   @return [Object] Something returned by the method, e.g. a query, a file, a row, ...
      #
      def_delegator :@job_method, :call, :call_method

      # @return [Method] The job's method to be called by the control
      attr_reader :job_method

      # @return [Hash] The configuration hash of the control's adapter
      attr_reader :options

      # @return [Adapter, nil] The control's adapter, nil when none was given
      attr_reader :adapter

      #
      # Creates a new control based on the job instance and the configured adapter.
      #
      # @param [Method] job_method The job's method to be called by the control
      # @param [Adapter] adapter The configured adapter
      # @param [Hash] **options The configuration hash of the control's adapter
      #
      def initialize(job_method, adapter = nil, **options)
        @job_method = job_method
        @adapter = adapter
        @options = options
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module Ductr
  module ETL
    #
    # Parent class of every destination control.
    #
    class Destination < Control
      #
      # Hands the row over to the job's method linked to this control.
      #
      # @param [Object] row The row to write
      #
      # @return [void]
      #
      def write(row) = call_method(row)

      #
      # Hook invoked once the last row is reached.
      # Does nothing at this level, subclasses override it when they have something to finalize.
      #
      # @return [void]
      #
      def close
      end
    end
  end
end
@@ -0,0 +1,47 @@
1
+ # frozen_string_literal: true
2
+
3
module Ductr
  module ETL
    #
    # Parent class of the sources reading their rows page by page.
    #
    class PaginatedSource < Source
      #
      # Number of rows per page, taken from the `:page_size` option and falling back to 10_000.
      #
      # @return [Integer] The page size
      #
      def page_size
        configured_size = @options[:page_size]
        configured_size || 10_000
      end

      #
      # Calls #each_page repeatedly, moving the offset forward by one page after each call,
      # and stops as soon as #each_page returns a falsy value.
      #
      # @yield [row] The row yielder
      #
      # @return [void]
      #
      def each(&block)
        @offset ||= 0

        loop do
          more_pages = each_page(&block)
          break unless more_pages

          @offset += page_size
        end
      end

      #
      # Invoked once per page, to be implemented by subclasses.
      # The returned value tells #each whether to go on: truthy to request the next page, falsy to stop.
      #
      # @yield [row] The row yielder
      #
      # @return [Boolean] Whether another page must be requested
      #
      def each_page(&)
        raise NotImplementedError, "A paginated source must implement the `#each_page` method"
      end
    end
  end
end
@@ -0,0 +1,21 @@
1
+ # frozen_string_literal: true
2
+
3
module Ductr
  module ETL
    #
    # Parent class of every source control.
    #
    class Source < Control
      #
      # Invokes the job's method linked to this control and iterates over what it returns.
      #
      # @yield [row] The row yielder
      #
      # @return [void]
      #
      def each(&block)
        rows = call_method
        rows.each(&block)
      end
    end
  end
end
@@ -0,0 +1,28 @@
1
+ # frozen_string_literal: true
2
+
3
module Ductr
  module ETL
    #
    # Parent class of every transform control.
    #
    class Transform < Control
      #
      # Hands the row over to the job's method linked to this control and returns its result.
      #
      # @param [Object] row The row to process
      #
      # @return [Object] Whatever the job's method returns
      #
      def process(row) = call_method(row)

      #
      # Hook invoked once the last row is reached, does nothing at this level.
      #
      # @return [void]
      #
      def close
      end
    end
  end
end