multiwoven-integrations 0.22.1 → 0.22.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/multiwoven/integrations/core/unstructured_source_connector.rb +52 -0
- data/lib/multiwoven/integrations/protocol/protocol.rb +1 -1
- data/lib/multiwoven/integrations/rollout.rb +1 -1
- data/lib/multiwoven/integrations/source/amazon_s3/client.rb +103 -10
- data/lib/multiwoven/integrations/source/amazon_s3/config/spec.json +13 -4
- data/lib/multiwoven/integrations/source/sftp/client.rb +5 -1
- data/lib/multiwoven/integrations.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2352e4287b9b68a5f417a413da77bb2776c5e17e0330373f49ec182e3021aab3
|
4
|
+
data.tar.gz: cdae111a370a6e5a637141463b001278569538b71fac1f229c4a8ee5cc4b0725
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3b7ca0805127b15270285a29fa13a45a847d259c7a74ff1fa1f6462bb3822ccd09237dc7d6a6345eb846e294233b94397fe150ab602e2df332e85aafc0d9af70
|
7
|
+
data.tar.gz: 422b9ff1ea7baad83d1a79066a09cdfcebbb509aff3e9768e20b0120b83043865e634dacf66750239a2cd2ef1e7309e7483baca0c698bb30c81f81ffa41ee07d
|
@@ -0,0 +1,52 @@
|
|
1
|
+
# frozen_string_literal: true
|
2
|
+
|
3
|
+
module Multiwoven
|
4
|
+
module Integrations::Core
|
5
|
+
class UnstructuredSourceConnector < SourceConnector
|
6
|
+
UNSTRUCTURED_SCHEMA = {
|
7
|
+
type: "object",
|
8
|
+
properties: {
|
9
|
+
element_id: { type: "string" },
|
10
|
+
text: { type: "string" },
|
11
|
+
created_date: { type: "string" },
|
12
|
+
modified_date: { type: "string" },
|
13
|
+
filename: { type: "string" },
|
14
|
+
filetype: { type: "string" }
|
15
|
+
},
|
16
|
+
required: %w[
|
17
|
+
element_id
|
18
|
+
text
|
19
|
+
created_date
|
20
|
+
modified_date
|
21
|
+
filename
|
22
|
+
filetype
|
23
|
+
]
|
24
|
+
}.freeze
|
25
|
+
|
26
|
+
UNSTRUCTURED_STREAM_CONFIG = {
|
27
|
+
supported_sync_modes: ["incremental"],
|
28
|
+
source_defined_cursor: true,
|
29
|
+
default_cursor_field: ["modified_date"],
|
30
|
+
source_defined_primary_key: [["element_id"]]
|
31
|
+
}.freeze
|
32
|
+
|
33
|
+
# Commands for unstructured data operations
|
34
|
+
UNSTRUCTURED = "unstructured"
|
35
|
+
LIST_FILES_CMD = "list_files"
|
36
|
+
DOWNLOAD_FILE_CMD = "download_file"
|
37
|
+
|
38
|
+
def unstructured_data?(connection_config)
|
39
|
+
connection_config["data_type"] == UNSTRUCTURED
|
40
|
+
end
|
41
|
+
|
42
|
+
def create_unstructured_stream
|
43
|
+
Multiwoven::Integrations::Protocol::Stream.new(
|
44
|
+
name: UNSTRUCTURED,
|
45
|
+
action: StreamAction["fetch"],
|
46
|
+
json_schema: UNSTRUCTURED_SCHEMA,
|
47
|
+
**UNSTRUCTURED_STREAM_CONFIG
|
48
|
+
)
|
49
|
+
end
|
50
|
+
end
|
51
|
+
end
|
52
|
+
end
|
@@ -10,7 +10,7 @@ module Multiwoven
|
|
10
10
|
SyncStatus = Types::String.enum("started", "running", "complete", "incomplete")
|
11
11
|
DestinationSyncMode = Types::String.enum("insert", "upsert")
|
12
12
|
ConnectorType = Types::String.enum("source", "destination")
|
13
|
-
ConnectorQueryType = Types::String.enum("raw_sql", "soql", "ai_ml")
|
13
|
+
ConnectorQueryType = Types::String.enum("raw_sql", "soql", "ai_ml", "unstructured")
|
14
14
|
ModelQueryType = Types::String.enum("raw_sql", "dbt", "soql", "table_selector", "ai_ml", "dynamic_sql")
|
15
15
|
ConnectionStatusType = Types::String.enum("succeeded", "failed")
|
16
16
|
StreamType = Types::String.enum("static", "dynamic", "user_defined")
|
@@ -3,14 +3,21 @@
|
|
3
3
|
module Multiwoven::Integrations::Source
|
4
4
|
module AmazonS3
|
5
5
|
include Multiwoven::Integrations::Core
|
6
|
-
class Client < SourceConnector
|
6
|
+
class Client < UnstructuredSourceConnector
|
7
7
|
@session_name = ""
|
8
|
+
|
8
9
|
def check_connection(connection_config)
|
9
10
|
connection_config = connection_config.with_indifferent_access
|
10
11
|
@session_name = "connection-#{connection_config[:region]}-#{connection_config[:bucket]}"
|
11
|
-
|
12
|
-
|
13
|
-
|
12
|
+
|
13
|
+
if unstructured_data?(connection_config)
|
14
|
+
create_s3_connection(connection_config)
|
15
|
+
@s3_resource.bucket(connection_config[:bucket]).objects.limit(1).first
|
16
|
+
else
|
17
|
+
conn = create_connection(connection_config)
|
18
|
+
path = build_path(connection_config)
|
19
|
+
get_results(conn, "DESCRIBE SELECT * FROM '#{path}';")
|
20
|
+
end
|
14
21
|
ConnectionStatus.new(status: ConnectionStatusType["succeeded"]).to_multiwoven_message
|
15
22
|
rescue StandardError => e
|
16
23
|
ConnectionStatus.new(status: ConnectionStatusType["failed"], message: e.message).to_multiwoven_message
|
@@ -19,12 +26,17 @@ module Multiwoven::Integrations::Source
|
|
19
26
|
def discover(connection_config)
|
20
27
|
connection_config = connection_config.with_indifferent_access
|
21
28
|
@session_name = "discover-#{connection_config[:region]}-#{connection_config[:bucket]}"
|
22
|
-
|
23
|
-
|
24
|
-
|
25
|
-
|
26
|
-
|
27
|
-
|
29
|
+
|
30
|
+
streams = if unstructured_data?(connection_config)
|
31
|
+
[create_unstructured_stream]
|
32
|
+
else
|
33
|
+
conn = create_connection(connection_config)
|
34
|
+
# If pulling from multiple files, all files must have the same schema
|
35
|
+
path = build_path(connection_config)
|
36
|
+
records = get_results(conn, "DESCRIBE SELECT * FROM '#{path}';")
|
37
|
+
columns = build_discover_columns(records)
|
38
|
+
[Multiwoven::Integrations::Protocol::Stream.new(name: path, action: StreamAction["fetch"], json_schema: convert_to_json_schema(columns))]
|
39
|
+
end
|
28
40
|
catalog = Catalog.new(streams: streams)
|
29
41
|
catalog.to_multiwoven_message
|
30
42
|
rescue StandardError => e
|
@@ -34,6 +46,9 @@ module Multiwoven::Integrations::Source
|
|
34
46
|
def read(sync_config)
|
35
47
|
connection_config = sync_config.source.connection_specification.with_indifferent_access
|
36
48
|
@session_name = "#{sync_config.sync_id}-#{sync_config.source.name}-#{sync_config.destination.name}"
|
49
|
+
|
50
|
+
return handle_unstructured_data(sync_config) if unstructured_data?(connection_config)
|
51
|
+
|
37
52
|
conn = create_connection(connection_config)
|
38
53
|
query = sync_config.model.query
|
39
54
|
query = batched_query(query, sync_config.limit, sync_config.offset) unless sync_config.limit.nil? && sync_config.offset.nil?
|
@@ -69,6 +84,19 @@ module Multiwoven::Integrations::Source
|
|
69
84
|
end
|
70
85
|
end
|
71
86
|
|
87
|
+
def create_s3_connection(connection_config)
|
88
|
+
connection_config = connection_config.with_indifferent_access
|
89
|
+
|
90
|
+
# Get authentication credentials
|
91
|
+
auth_data = get_auth_data(connection_config)
|
92
|
+
|
93
|
+
# Create S3 resource for easier operations
|
94
|
+
@s3_resource = Aws::S3::Resource.new(
|
95
|
+
region: connection_config[:region],
|
96
|
+
credentials: auth_data
|
97
|
+
)
|
98
|
+
end
|
99
|
+
|
72
100
|
def create_connection(connection_config)
|
73
101
|
# In the case when previewing a query
|
74
102
|
@session_name = "preview-#{connection_config[:region]}-#{connection_config[:bucket]}" if @session_name.to_s.empty?
|
@@ -137,6 +165,71 @@ module Multiwoven::Integrations::Source
|
|
137
165
|
"boolean"
|
138
166
|
end
|
139
167
|
end
|
168
|
+
|
169
|
+
def handle_unstructured_data(sync_config)
|
170
|
+
connection_config = sync_config.source.connection_specification.with_indifferent_access
|
171
|
+
bucket_name = connection_config[:bucket]
|
172
|
+
command = sync_config.model.query.strip
|
173
|
+
create_s3_connection(connection_config)
|
174
|
+
|
175
|
+
case command
|
176
|
+
when LIST_FILES_CMD
|
177
|
+
list_files_in_folder(bucket_name, connection_config[:path] || "")
|
178
|
+
when /^#{DOWNLOAD_FILE_CMD}\s+(.+)$/
|
179
|
+
# Extract the file path and remove surrounding quotes if present
|
180
|
+
file_path = ::Regexp.last_match(1).strip
|
181
|
+
file_path = file_path.gsub(/^["']|["']$/, "") # Remove leading/trailing quotes
|
182
|
+
download_file_to_local(bucket_name, file_path, sync_config.sync_id)
|
183
|
+
else
|
184
|
+
raise "Invalid command. Supported commands: #{LIST_FILES_CMD}, #{DOWNLOAD_FILE_CMD} <file_path>"
|
185
|
+
end
|
186
|
+
end
|
187
|
+
|
188
|
+
def list_files_in_folder(bucket_name, folder_path)
|
189
|
+
folder_path = folder_path.end_with?("/") ? folder_path : "#{folder_path}/"
|
190
|
+
bucket = @s3_resource.bucket(bucket_name)
|
191
|
+
|
192
|
+
bucket.objects(prefix: folder_path).reject { |object| object.key == folder_path }.map do |object|
|
193
|
+
RecordMessage.new(
|
194
|
+
data: {
|
195
|
+
file_name: File.basename(object.key),
|
196
|
+
file_path: object.key,
|
197
|
+
size: object.content_length,
|
198
|
+
file_type: File.extname(object.key).sub(".", ""),
|
199
|
+
created_date: object.last_modified.to_s,
|
200
|
+
modified_date: object.last_modified.to_s
|
201
|
+
},
|
202
|
+
emitted_at: Time.now.to_i
|
203
|
+
).to_multiwoven_message
|
204
|
+
end
|
205
|
+
end
|
206
|
+
|
207
|
+
def download_file_to_local(bucket_name, file_path, sync_id)
|
208
|
+
download_path = ENV["FILE_DOWNLOAD_PATH"]
|
209
|
+
file = if download_path
|
210
|
+
File.join(download_path, "syncs", sync_id, File.basename(file_path))
|
211
|
+
else
|
212
|
+
Tempfile.new(["s3_file", "syncs", sync_id, File.extname(file_path)]).path
|
213
|
+
end
|
214
|
+
|
215
|
+
object = @s3_resource.bucket(bucket_name).object(file_path)
|
216
|
+
object.get(response_target: file)
|
217
|
+
|
218
|
+
[RecordMessage.new(
|
219
|
+
data: {
|
220
|
+
local_path: file,
|
221
|
+
file_name: File.basename(file_path),
|
222
|
+
file_path: file_path,
|
223
|
+
size: object.content_length,
|
224
|
+
file_type: File.extname(file_path).sub(".", ""),
|
225
|
+
modified_date: object.last_modified.to_s,
|
226
|
+
created_date: object.last_modified.to_s
|
227
|
+
},
|
228
|
+
emitted_at: Time.now.to_i
|
229
|
+
).to_multiwoven_message]
|
230
|
+
rescue Aws::S3::Errors::NoSuchKey
|
231
|
+
raise "File not found: #{file_path}"
|
232
|
+
end
|
140
233
|
end
|
141
234
|
end
|
142
235
|
end
|
@@ -19,8 +19,7 @@
|
|
19
19
|
"region",
|
20
20
|
"bucket",
|
21
21
|
"access_id",
|
22
|
-
"secret_access",
|
23
|
-
"file_type"
|
22
|
+
"secret_access"
|
24
23
|
]
|
25
24
|
},
|
26
25
|
"else": {
|
@@ -29,11 +28,21 @@
|
|
29
28
|
"region",
|
30
29
|
"bucket",
|
31
30
|
"arn",
|
32
|
-
"external_id",
|
33
|
-
"file_type"
|
31
|
+
"external_id"
|
34
32
|
]
|
35
33
|
},
|
36
34
|
"properties": {
|
35
|
+
"data_type": {
|
36
|
+
"description": "Type of data in the files",
|
37
|
+
"type": "string",
|
38
|
+
"title": "Data Type",
|
39
|
+
"enum": [
|
40
|
+
"structured",
|
41
|
+
"unstructured"
|
42
|
+
],
|
43
|
+
"default": "structured",
|
44
|
+
"order": 9
|
45
|
+
},
|
37
46
|
"auth_type": {
|
38
47
|
"title": "Authentication type",
|
39
48
|
"type": "string",
|
@@ -87,7 +87,11 @@ module Multiwoven::Integrations::Source
|
|
87
87
|
|
88
88
|
def query(conn, query)
|
89
89
|
@sftp.download!(@remote_file_path, @tempfile.path)
|
90
|
-
|
90
|
+
if query.gsub(/FROM\s+\S+/i).count > 1
|
91
|
+
query = query.gsub("FROM #{@remote_file_path}", "FROM read_csv_auto('#{@tempfile.path}')")
|
92
|
+
elsif query.match?(/\bFROM\b/i)
|
93
|
+
query = query.gsub(/FROM\s+\S+/i, "FROM read_csv_auto('#{@tempfile.path}')")
|
94
|
+
end
|
91
95
|
records = get_results(conn, query)
|
92
96
|
records.map do |row|
|
93
97
|
RecordMessage.new(data: row, emitted_at: Time.now.to_i).to_multiwoven_message
|
@@ -56,6 +56,7 @@ require_relative "integrations/core/http_helper"
|
|
56
56
|
require_relative "integrations/core/http_client"
|
57
57
|
require_relative "integrations/core/streaming_http_client"
|
58
58
|
require_relative "integrations/core/query_builder"
|
59
|
+
require_relative "integrations/core/unstructured_source_connector"
|
59
60
|
|
60
61
|
# Source
|
61
62
|
require_relative "integrations/source/snowflake/client"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: multiwoven-integrations
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.22.1
|
4
|
+
version: 0.22.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Subin T P
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-04-
|
11
|
+
date: 2025-04-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|
@@ -585,6 +585,7 @@ files:
|
|
585
585
|
- lib/multiwoven/integrations/core/rate_limiter.rb
|
586
586
|
- lib/multiwoven/integrations/core/source_connector.rb
|
587
587
|
- lib/multiwoven/integrations/core/streaming_http_client.rb
|
588
|
+
- lib/multiwoven/integrations/core/unstructured_source_connector.rb
|
588
589
|
- lib/multiwoven/integrations/core/utils.rb
|
589
590
|
- lib/multiwoven/integrations/destination/airtable/client.rb
|
590
591
|
- lib/multiwoven/integrations/destination/airtable/config/catalog.json
|