safra 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 23e8b482c9f993d696c464b3b1ee7853644f297903c23841b3e92c032413da1d
4
+ data.tar.gz: 305f791964ffd4115c45af7d4f6757eab9eadab5513fa6bf6bc6aec7d00785a2
5
+ SHA512:
6
+ metadata.gz: '09c1f4a12ea0308f25330c83ac8b06359ab90740c451b7ead7cf5f133dbefc254da0d677badc60f04a66c7ceeafc514b84a3544a890a1642c74da4fd35a2b92e'
7
+ data.tar.gz: cb0a781efe89bf7cdd84ecd74311e151ae4b858233325601135908d2d93f693e06cf707c1d0043d098e42ed6bfe3c11c0e4be2402a18b216491947b537db648d
@@ -0,0 +1,22 @@
1
+ require "rails/generators/active_record"
2
+ # Rails generator that adds the Request model and the migration creating the requests table.
3
+ module Safra
4
+ module Generators
5
+ class InstallGenerator < Rails::Generators::Base
6
+ include ActiveRecord::Generators::Migration
7
+ source_root File.join(__dir__, "templates")
8
+ # Renders the request.rb.tt template to app/models/request.rb
9
+ def copy_model_file
10
+ template "request.rb.tt", "app/models/request.rb"
11
+ end
12
+ # Renders the requests migration template into db/migrate; migration_version is passed along as a template option
13
+ def copy_migration
14
+ migration_template "requests_migration.rb", "db/migrate/create_requests.rb", migration_version: migration_version
15
+ end
16
+ # Bracketed ActiveRecord major.minor version (e.g. "[7.1]") appended to ActiveRecord::Migration in the migration template
17
+ def migration_version
18
+ "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]"
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,20 @@
1
+
2
+ module Extractor
3
+ module Generators
4
+ class PipelineGenerator < Rails::Generators::NamedBase
5
+ source_root File.expand_path('templates', __dir__)
6
+
7
+ def copy_cleaned_file
8
+ template "pipeline/cleaned.sql.tt", "app/sql/#{file_name}/#{file_name}_cleaned.sql"
9
+ end
10
+
11
+ def copy_deduplicated_file
12
+ template "pipeline/deduplicated.sql.tt", "app/sql/#{file_name}/#{file_name}_cleaned_deduplicated.sql"
13
+ end
14
+
15
+ def copy_model_file
16
+ template "pipeline/model.sql.tt", "app/sql/#{file_name}/#{file_name}_model.sql"
17
+ end
18
+ end
19
+ end
20
+ end
@@ -0,0 +1,22 @@
1
+ require "rails/generators/active_record"
2
+
3
+ module Extractor
4
+ module Generators
5
+ class RequestsGenerator < Rails::Generators::Base
6
+ include ActiveRecord::Generators::Migration
7
+ source_root File.join(__dir__, "templates")
8
+
9
+ def copy_model_file
10
+ template "request.rb.tt", "app/models/request.rb"
11
+ end
12
+
13
+ def copy_migration
14
+ migration_template "requests_migration.rb", "db/migrate/create_requests.rb", migration_version: migration_version
15
+ end
16
+
17
+ def migration_version
18
+ "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]"
19
+ end
20
+ end
21
+ end
22
+ end
@@ -0,0 +1,16 @@
1
+ # Rails generator that scaffolds a tap class and its companion SQL file for a given name.
2
+ module Safra
3
+ module Generators
4
+ class TapGenerator < Rails::Generators::NamedBase
5
+ source_root File.expand_path('templates', __dir__)
6
+ # Renders tap.rb.tt to app/extractors/<name>_tap.rb
7
+ def copy_tap_file
8
+ template "tap.rb.tt", "app/extractors/#{file_name}_tap.rb"
9
+ end
10
+ # Renders dbt.sql.tt to app/sql/<name>.sql
11
+ def copy_sql_file
12
+ template "dbt.sql.tt", "app/sql/#{file_name}.sql"
13
+ end
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,5 @@
1
+
2
+ select
3
+ response_options -> 'response_body' body
4
+ from <%%= source 'requests' %>
5
+ where extractor_class = '<%= "#{class_name}Tap" %>'
@@ -0,0 +1,8 @@
1
+ SELECT
2
+ jsonb_path_query(response_options, '$.response_body.data[*]') as body,
3
+ jsonb_path_query(response_options, '$.response_body.data[*].id') as unique_by,
4
+ id as request_id,
5
+ created_at,
6
+ account_id
7
+ FROM <%%= source 'requests' %>
8
+ WHERE extractor_class = '<%= "#{class_name}Tap" %>'
@@ -0,0 +1,9 @@
1
+ <%%= build_as :incremental %>
2
+ SELECT
3
+ DISTINCT ON(unique_by)
4
+ *
5
+ FROM <%%= ref('<%= "#{file_name}_cleaned" %>') %>
6
+ <%% if is_incremental %>
7
+ WHERE created_at > (SELECT coalesce(max(created_at), '-infinity'::DATE) FROM <%%= this %>)
8
+ <%% end %>
9
+ ORDER BY unique_by, created_at DESC
@@ -0,0 +1,7 @@
1
+ SELECT
2
+ unique_by,
3
+ body,
4
+ created_at,
5
+ account_id,
6
+ request_id
7
+ FROM <%%= ref('<%= "#{file_name}_cleaned_deduplicated" %>') %>
@@ -0,0 +1,19 @@
1
+ class Request < ApplicationRecord
2
+ class << self
3
+ Dir.glob('app/extractors/**/*').map {|t| t.scan /extractors\/(.*_tap)\.rb/}.flatten.each do |name|
4
+ define_method(name) {where extractor_class: name.classify}
5
+ end
6
+
7
+ def create_from_response typhoeus_response
8
+ res = Extractor::ResponseWithJson.from_response typhoeus_response
9
+ create!({
10
+ extractor_class: self.class,
11
+ base_url: res.request.base_url,
12
+ request_options: res.request.options,
13
+ request_original_options: res.request.original_options,
14
+ response_options: res.parsed_options,
15
+ request_cache_key: res.request.cache_key
16
+ })
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,16 @@
1
+ class <%= migration_class_name %> < ActiveRecord::Migration<%= migration_version %>
2
+ def change
3
+ create_table :requests do |t|
4
+ t.string :extractor_class
5
+ t.string :account_id
6
+ t.string :base_url
7
+ t.jsonb :request_options
8
+ t.jsonb :request_original_options
9
+ t.jsonb :response_options
10
+ t.string :request_cache_key
11
+ t.jsonb :aux
12
+
13
+ t.timestamps
14
+ end
15
+ end
16
+ end
@@ -0,0 +1,20 @@
1
+ class <%= class_name %>Tap < Safra::Tap
2
+
3
+ # Value counts up from 1 by default
4
+ # Customize defining first_value and next_value methods
5
+ def request_for value
6
+ Typhoeus.get "https://jsonplaceholder.typicode.com/posts/#{value}"
7
+ end
8
+
9
+ # Use response.json for a parsed body (nil if invalid json)
10
+ # Return anything other than nil or false if reached the end
11
+ def reached_end? response
12
+ response.body == "{}"
13
+ end
14
+
15
+ # Return nil, false or explode this method on invalid response
16
+ def validate response
17
+ Integer(response.json['id'])
18
+ end
19
+
20
+ end
@@ -0,0 +1,5 @@
1
+ class <%= migration_class_name %> < ActiveRecord::Migration<%= migration_version %>
2
+ def change
3
+ add_column :requests, :aux, :jsonb
4
+ end
5
+ end
@@ -0,0 +1,18 @@
1
+ require "rails/generators/active_record"
2
+
3
+ module Extractor
4
+ module Generators
5
+ class UpdateGenerator < Rails::Generators::Base
6
+ include ActiveRecord::Generators::Migration
7
+ source_root File.join(__dir__, "templates")
8
+
9
+ def copy_migration
10
+ migration_template "update_requests_migration.rb", "db/migrate/update_requests.rb", migration_version: migration_version
11
+ end
12
+
13
+ def migration_version
14
+ "[#{ActiveRecord::VERSION::MAJOR}.#{ActiveRecord::VERSION::MINOR}]"
15
+ end
16
+ end
17
+ end
18
+ end
@@ -0,0 +1,20 @@
1
+ module Safra
2
+ class ResponseWithJson < Typhoeus::Response # Typhoeus::Response with JSON helpers for the body
3
+ def self.from_response response # builds an instance from response.options and links it with the original request in both directions
4
+ res = new(response.options)
5
+ response.request.response = res
6
+ res.request = response.request
7
+ res
8
+ end
9
+ # Parsed JSON body, or nil when parsing raises
10
+ def json
11
+ JSON.parse(body) rescue nil
12
+ end
13
+ # Shallow copy of options with :response_body replaced by its parsed JSON; the original value is kept when parsing raises
14
+ def parsed_options
15
+ a = options.dup
16
+ a[:response_body] = JSON.parse(a[:response_body]) rescue a[:response_body]
17
+ a
18
+ end
19
+ end
20
+ end
data/lib/safra/tap.rb ADDED
@@ -0,0 +1,151 @@
1
+ module Safra
2
+ class Tap
3
+
4
+ MAX_RETRIES = 4
5
+ SUPPORTED_ON_MAX_RETRIES_VALUES = [:fail, :save_to_errors, :skip_silently]
6
+ ON_MAX_RETRIES = :fail
7
+ MAX_CONCURRENCY = 200
8
+
9
+ def initialize parameter=nil, auth:{}
10
+ check_on_max_retries
11
+ @auth = auth.with_indifferent_access
12
+ if @auth.present? and @auth[:account_id].blank?
13
+ puts "WARNING: No account_id provided in 'auth:' parameter for #{self.class}"
14
+ end
15
+ @parameter = parameter
16
+
17
+ @request_for_method_name =
18
+ self.class.instance_methods(false)
19
+ .find {|m| m.to_s.start_with?('request_for_')} || :request_for
20
+
21
+ @request_for_batch_size = [
22
+ @request_for_method_name
23
+ .to_s.gsub('request_for_','').to_i,
24
+ 1].max
25
+
26
+ if @parameter.is_a? Array
27
+ @current_value = @parameter.first @request_for_batch_size
28
+ else
29
+ @current_value = first_value || 1
30
+ end
31
+ @start_time = Time.now.to_i
32
+ end
33
+
34
+ def next_value
35
+ if @current_value.is_a? Array
36
+ @parameter -= @current_value
37
+ @parameter.first @request_for_batch_size
38
+ else
39
+ @current_value + 1
40
+ end
41
+ end
42
+
43
+ def reached_end? res
44
+ @parameter.empty?
45
+ end
46
+
47
+ def first_value
48
+ nil
49
+ end
50
+
51
+ def perform
52
+ @retries_count = 0
53
+ while @current_value.present? do
54
+ original_response = send(@request_for_method_name, @request_for_batch_size == 1 ? (@current_value.first rescue @current_value) : @current_value)
55
+ raise "Function request_for() should return a Typhoeus::Response, but returned #{original_response.class}" if original_response.class != Typhoeus::Response
56
+ res = ResponseWithJson.from_response original_response
57
+ response_valid = (validate(res) rescue nil)
58
+ if response_valid
59
+ @last_response = res
60
+ Request.insert! build_request_model(res)
61
+ @retries_count = 0
62
+ if (reached_end?(res) rescue false)
63
+ @current_value = nil
64
+ else
65
+ @current_value = next_value
66
+ end
67
+ else
68
+ if (reached_end?(res) rescue false)
69
+ @current_value = nil
70
+ elsif @retries_count < self.class::MAX_RETRIES
71
+ @retries_count += 1
72
+ puts "sleep #{(2**@retries_count)} then will retry #{'again' if @retries_count > 1}"
73
+ sleep (2**@retries_count)
74
+ redo
75
+ else
76
+ case self.class::ON_MAX_RETRIES
77
+ when :fail
78
+ raise "Maximum number of retries reached (#{self.class::MAX_RETRIES})"
79
+ when :save_to_errors
80
+ Request.insert! build_request_model_for_error(res)
81
+ when :skip_silently
82
+ puts "skiped on value #{@current_value}"
83
+ end
84
+ @current_value = next_value
85
+ @retries_count = 0
86
+ end
87
+ end
88
+
89
+ end
90
+ puts "Completed run of #{self.class}"
91
+ end
92
+
93
+ def parallel_perform
94
+ @hydra = Typhoeus::Hydra.new(max_concurrency: self.class::MAX_CONCURRENCY)
95
+ @parameter.each_slice(@request_for_batch_size) do |batch|
96
+ request = send(@request_for_method_name, (@request_for_batch_size == 1 ? (batch.first rescue batch) : batch))
97
+ raise "Function request_for() should return a Typhoeus::Request, but returned #{request.class}" if request.class != Typhoeus::Request
98
+ request.on_complete do |response|
99
+ res = ResponseWithJson.from_response response
100
+ response_valid = (validate(res) rescue nil)
101
+ if response_valid
102
+ Request.insert! build_request_model(res)
103
+ else
104
+ Request.insert! build_request_model_for_error(res)
105
+ end
106
+ end
107
+ @hydra.queue request
108
+ end
109
+ @hydra.run
110
+ end
111
+
112
+ private
113
+ def build_request_model typhoeus_response
114
+ request_model = {
115
+ extractor_class: self.class,
116
+ account_id: @auth[:account_id],
117
+ base_url: typhoeus_response.request.base_url,
118
+ request_options: typhoeus_response.request.options,
119
+ request_original_options: typhoeus_response.request.original_options,
120
+ response_options: typhoeus_response.parsed_options,
121
+ request_cache_key: typhoeus_response.request.cache_key
122
+ }
123
+ if Request.column_names.include?('aux')
124
+ request_model[:aux] = { value: @current_value, retries: @retries_count, run_id: "#{object_id}-#{@start_time}"}
125
+ end
126
+ request_model
127
+ end
128
+
129
+ def build_request_model_for_error typhoeus_response
130
+ request_model = {
131
+ extractor_class: "#{self.class}_errors",
132
+ account_id: @auth[:account_id],
133
+ base_url: typhoeus_response.request.base_url,
134
+ request_options: typhoeus_response.request.options,
135
+ request_original_options: typhoeus_response.request.original_options,
136
+ response_options: typhoeus_response.parsed_options,
137
+ request_cache_key: typhoeus_response.request.cache_key,
138
+ }
139
+ if Request.column_names.include?('aux')
140
+ request_model[:aux] = { value: @current_value, retries: @retries_count, run_id: "#{object_id}-#{@start_time}"}
141
+ end
142
+ request_model
143
+ end
144
+
145
+ def check_on_max_retries
146
+ unless SUPPORTED_ON_MAX_RETRIES_VALUES.include? self.class::ON_MAX_RETRIES
147
+ raise "Unsuported ON_MAX_RETRIES value #{self.class::ON_MAX_RETRIES} supported values are #{SUPPORTED_ON_MAX_RETRIES_VALUES}"
148
+ end
149
+ end
150
+ end
151
+ end
data/lib/safra.rb ADDED
@@ -0,0 +1,8 @@
1
+ require "zeitwerk"
2
+ loader = Zeitwerk::Loader.for_gem
3
+ loader.ignore("#{__dir__}/generators")
4
+ loader.setup
5
+
6
+ module Safra
7
+
8
+ end
metadata ADDED
@@ -0,0 +1,110 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: safra
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.1
5
+ platform: ruby
6
+ authors:
7
+ - Felipe Mesquita
8
+ bindir: bin
9
+ cert_chain: []
10
+ date: 1980-01-02 00:00:00.000000000 Z
11
+ dependencies:
12
+ - !ruby/object:Gem::Dependency
13
+ name: railties
14
+ requirement: !ruby/object:Gem::Requirement
15
+ requirements:
16
+ - - ">="
17
+ - !ruby/object:Gem::Version
18
+ version: '6'
19
+ type: :runtime
20
+ prerelease: false
21
+ version_requirements: !ruby/object:Gem::Requirement
22
+ requirements:
23
+ - - ">="
24
+ - !ruby/object:Gem::Version
25
+ version: '6'
26
+ - !ruby/object:Gem::Dependency
27
+ name: activerecord
28
+ requirement: !ruby/object:Gem::Requirement
29
+ requirements:
30
+ - - ">="
31
+ - !ruby/object:Gem::Version
32
+ version: '6'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: !ruby/object:Gem::Requirement
36
+ requirements:
37
+ - - ">="
38
+ - !ruby/object:Gem::Version
39
+ version: '6'
40
+ - !ruby/object:Gem::Dependency
41
+ name: zeitwerk
42
+ requirement: !ruby/object:Gem::Requirement
43
+ requirements:
44
+ - - ">="
45
+ - !ruby/object:Gem::Version
46
+ version: '0'
47
+ type: :runtime
48
+ prerelease: false
49
+ version_requirements: !ruby/object:Gem::Requirement
50
+ requirements:
51
+ - - ">="
52
+ - !ruby/object:Gem::Version
53
+ version: '0'
54
+ - !ruby/object:Gem::Dependency
55
+ name: typhoeus
56
+ requirement: !ruby/object:Gem::Requirement
57
+ requirements:
58
+ - - ">="
59
+ - !ruby/object:Gem::Version
60
+ version: '0'
61
+ type: :runtime
62
+ prerelease: false
63
+ version_requirements: !ruby/object:Gem::Requirement
64
+ requirements:
65
+ - - ">="
66
+ - !ruby/object:Gem::Version
67
+ version: '0'
68
+ email: felipemesquita@hey.com
69
+ executables: []
70
+ extensions: []
71
+ extra_rdoc_files: []
72
+ files:
73
+ - lib/generators/safra/install_generator.rb
74
+ - lib/generators/safra/pipeline_generator.rb
75
+ - lib/generators/safra/requests_generator.rb
76
+ - lib/generators/safra/tap_generator.rb
77
+ - lib/generators/safra/templates/dbt.sql.tt
78
+ - lib/generators/safra/templates/pipeline/cleaned.sql.tt
79
+ - lib/generators/safra/templates/pipeline/deduplicated.sql.tt
80
+ - lib/generators/safra/templates/pipeline/model.sql.tt
81
+ - lib/generators/safra/templates/request.rb.tt
82
+ - lib/generators/safra/templates/requests_migration.rb.tt
83
+ - lib/generators/safra/templates/tap.rb.tt
84
+ - lib/generators/safra/templates/update_requests_migration.rb.tt
85
+ - lib/generators/safra/update_generator.rb
86
+ - lib/safra.rb
87
+ - lib/safra/response_with_json.rb
88
+ - lib/safra/tap.rb
89
+ homepage: https://github.com/felipedmesquita/extractor
90
+ licenses:
91
+ - MIT
92
+ metadata: {}
93
+ rdoc_options: []
94
+ require_paths:
95
+ - lib
96
+ required_ruby_version: !ruby/object:Gem::Requirement
97
+ requirements:
98
+ - - ">="
99
+ - !ruby/object:Gem::Version
100
+ version: '2.7'
101
+ required_rubygems_version: !ruby/object:Gem::Requirement
102
+ requirements:
103
+ - - ">="
104
+ - !ruby/object:Gem::Version
105
+ version: '0'
106
+ requirements: []
107
+ rubygems_version: 3.6.9
108
+ specification_version: 4
109
+ summary: Extract data from APIs with minimal configuration
110
+ test_files: []