multiwoven-integrations 0.22.1 → 0.22.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: fc3297301450525e523cde59f46f1371bd67fa92a31a0a3135ae9818563a9b31
4
- data.tar.gz: 3168abc5dcd112f1cfd86b4daeb19d7501aba221261e5db75d1b10976bd95e6b
3
+ metadata.gz: 2352e4287b9b68a5f417a413da77bb2776c5e17e0330373f49ec182e3021aab3
4
+ data.tar.gz: cdae111a370a6e5a637141463b001278569538b71fac1f229c4a8ee5cc4b0725
5
5
  SHA512:
6
- metadata.gz: e141fddcc210447d6dcf88cae35e051050965d0d3548e7a30f8a4b164e9a8df5fdad9e07d3b44d0d5f5edc714f6e3f52d457b3959a74c8363e88414802198d59
7
- data.tar.gz: e32081015f5afeeeb76c393d8770b6e87901b4146681c2f9419ce75e2a8674c7d306566da0e4ca9d9cb43294019aa986540b094010dfb4a7dcefec4c5cc78558
6
+ metadata.gz: 3b7ca0805127b15270285a29fa13a45a847d259c7a74ff1fa1f6462bb3822ccd09237dc7d6a6345eb846e294233b94397fe150ab602e2df332e85aafc0d9af70
7
+ data.tar.gz: 422b9ff1ea7baad83d1a79066a09cdfcebbb509aff3e9768e20b0120b83043865e634dacf66750239a2cd2ef1e7309e7483baca0c698bb30c81f81ffa41ee07d
@@ -0,0 +1,52 @@
1
+ # frozen_string_literal: true
2
+
3
module Multiwoven
  module Integrations::Core
    # Source connector base class adding support for "unstructured" data:
    # a fixed JSON schema, a fixed stream configuration, the command names
    # used to drive file operations, and helpers to detect the mode and to
    # build the corresponding stream.
    class UnstructuredSourceConnector < SourceConnector
      # JSON schema of an unstructured record: six string fields, all required.
      UNSTRUCTURED_SCHEMA = {
        type: "object",
        properties: %i[element_id text created_date modified_date filename filetype]
          .to_h { |field| [field, { type: "string" }] },
        required: %w[element_id text created_date modified_date filename filetype]
      }.freeze

      # Stream settings merged into every unstructured stream definition.
      UNSTRUCTURED_STREAM_CONFIG = {
        supported_sync_modes: ["incremental"],
        source_defined_cursor: true,
        default_cursor_field: ["modified_date"],
        source_defined_primary_key: [["element_id"]]
      }.freeze

      # Commands for unstructured data operations
      UNSTRUCTURED = "unstructured"
      LIST_FILES_CMD = "list_files"
      DOWNLOAD_FILE_CMD = "download_file"

      # Tells whether the connection is configured for unstructured data.
      #
      # @param connection_config [#[]] configuration looked up with the
      #   string key "data_type"
      # @return [Boolean] true when "data_type" equals UNSTRUCTURED
      def unstructured_data?(connection_config)
        data_type = connection_config["data_type"]
        data_type == UNSTRUCTURED
      end

      # Builds the protocol stream describing unstructured data.
      #
      # @return [Multiwoven::Integrations::Protocol::Stream] stream named
      #   UNSTRUCTURED with the "fetch" action, UNSTRUCTURED_SCHEMA as its
      #   JSON schema and the UNSTRUCTURED_STREAM_CONFIG settings
      def create_unstructured_stream
        stream_attributes = {
          name: UNSTRUCTURED,
          action: StreamAction["fetch"],
          json_schema: UNSTRUCTURED_SCHEMA
        }.merge(UNSTRUCTURED_STREAM_CONFIG)

        Multiwoven::Integrations::Protocol::Stream.new(**stream_attributes)
      end
    end
  end
end
@@ -10,7 +10,7 @@ module Multiwoven
10
10
  SyncStatus = Types::String.enum("started", "running", "complete", "incomplete")
11
11
  DestinationSyncMode = Types::String.enum("insert", "upsert")
12
12
  ConnectorType = Types::String.enum("source", "destination")
13
- ConnectorQueryType = Types::String.enum("raw_sql", "soql", "ai_ml")
13
+ ConnectorQueryType = Types::String.enum("raw_sql", "soql", "ai_ml", "unstructured")
14
14
  ModelQueryType = Types::String.enum("raw_sql", "dbt", "soql", "table_selector", "ai_ml", "dynamic_sql")
15
15
  ConnectionStatusType = Types::String.enum("succeeded", "failed")
16
16
  StreamType = Types::String.enum("static", "dynamic", "user_defined")
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Multiwoven
4
4
  module Integrations
5
- VERSION = "0.22.1"
5
+ VERSION = "0.22.4"
6
6
 
7
7
  ENABLED_SOURCES = %w[
8
8
  Snowflake
@@ -3,14 +3,21 @@
3
3
  module Multiwoven::Integrations::Source
4
4
  module AmazonS3
5
5
  include Multiwoven::Integrations::Core
6
- class Client < SourceConnector
6
+ class Client < UnstructuredSourceConnector
7
7
  @session_name = ""
8
+
8
9
  def check_connection(connection_config)
9
10
  connection_config = connection_config.with_indifferent_access
10
11
  @session_name = "connection-#{connection_config[:region]}-#{connection_config[:bucket]}"
11
- conn = create_connection(connection_config)
12
- path = build_path(connection_config)
13
- get_results(conn, "DESCRIBE SELECT * FROM '#{path}';")
12
+
13
+ if unstructured_data?(connection_config)
14
+ create_s3_connection(connection_config)
15
+ @s3_resource.bucket(connection_config[:bucket]).objects.limit(1).first
16
+ else
17
+ conn = create_connection(connection_config)
18
+ path = build_path(connection_config)
19
+ get_results(conn, "DESCRIBE SELECT * FROM '#{path}';")
20
+ end
14
21
  ConnectionStatus.new(status: ConnectionStatusType["succeeded"]).to_multiwoven_message
15
22
  rescue StandardError => e
16
23
  ConnectionStatus.new(status: ConnectionStatusType["failed"], message: e.message).to_multiwoven_message
@@ -19,12 +26,17 @@ module Multiwoven::Integrations::Source
19
26
  def discover(connection_config)
20
27
  connection_config = connection_config.with_indifferent_access
21
28
  @session_name = "discover-#{connection_config[:region]}-#{connection_config[:bucket]}"
22
- conn = create_connection(connection_config)
23
- # If pulling from multiple files, all files must have the same schema
24
- path = build_path(connection_config)
25
- records = get_results(conn, "DESCRIBE SELECT * FROM '#{path}';")
26
- columns = build_discover_columns(records)
27
- streams = [Multiwoven::Integrations::Protocol::Stream.new(name: path, action: StreamAction["fetch"], json_schema: convert_to_json_schema(columns))]
29
+
30
+ streams = if unstructured_data?(connection_config)
31
+ [create_unstructured_stream]
32
+ else
33
+ conn = create_connection(connection_config)
34
+ # If pulling from multiple files, all files must have the same schema
35
+ path = build_path(connection_config)
36
+ records = get_results(conn, "DESCRIBE SELECT * FROM '#{path}';")
37
+ columns = build_discover_columns(records)
38
+ [Multiwoven::Integrations::Protocol::Stream.new(name: path, action: StreamAction["fetch"], json_schema: convert_to_json_schema(columns))]
39
+ end
28
40
  catalog = Catalog.new(streams: streams)
29
41
  catalog.to_multiwoven_message
30
42
  rescue StandardError => e
@@ -34,6 +46,9 @@ module Multiwoven::Integrations::Source
34
46
  def read(sync_config)
35
47
  connection_config = sync_config.source.connection_specification.with_indifferent_access
36
48
  @session_name = "#{sync_config.sync_id}-#{sync_config.source.name}-#{sync_config.destination.name}"
49
+
50
+ return handle_unstructured_data(sync_config) if unstructured_data?(connection_config)
51
+
37
52
  conn = create_connection(connection_config)
38
53
  query = sync_config.model.query
39
54
  query = batched_query(query, sync_config.limit, sync_config.offset) unless sync_config.limit.nil? && sync_config.offset.nil?
@@ -69,6 +84,19 @@ module Multiwoven::Integrations::Source
69
84
  end
70
85
  end
71
86
 
87
# Builds the S3 resource used by the unstructured-data operations and
# stores it in @s3_resource.
#
# @param connection_config [#with_indifferent_access] connection settings;
#   :region is read here, the rest is handed to get_auth_data
# @return [Aws::S3::Resource] the resource that was just assigned
def create_s3_connection(connection_config)
  settings = connection_config.with_indifferent_access
  credentials = get_auth_data(settings)

  # A resource (rather than a bare client) is kept for easier operations.
  @s3_resource = Aws::S3::Resource.new(region: settings[:region], credentials: credentials)
end
99
+
72
100
  def create_connection(connection_config)
73
101
  # In the case when previewing a query
74
102
  @session_name = "preview-#{connection_config[:region]}-#{connection_config[:bucket]}" if @session_name.to_s.empty?
@@ -137,6 +165,71 @@ module Multiwoven::Integrations::Source
137
165
  "boolean"
138
166
  end
139
167
  end
168
+
169
# Runs the unstructured-data command carried in the model query.
#
# The query is interpreted as a command string: LIST_FILES_CMD on its own,
# or DOWNLOAD_FILE_CMD followed by an object key (optionally wrapped in
# single or double quotes).
#
# @param sync_config [Object] read for source.connection_specification,
#   model.query and sync_id
# @return [Object] the result of list_files_in_folder or
#   download_file_to_local
# @raise [RuntimeError] when the query is not one of the supported commands
def handle_unstructured_data(sync_config)
  connection_config = sync_config.source.connection_specification.with_indifferent_access
  bucket_name = connection_config[:bucket]
  # to_s lets a missing query fall through to the "Invalid command" error.
  command = sync_config.model.query.to_s.strip
  create_s3_connection(connection_config)

  case command
  when LIST_FILES_CMD
    list_files_in_folder(bucket_name, connection_config[:path] || "")
  when /\A#{DOWNLOAD_FILE_CMD}\s+(.+)\z/
    # \A/\z anchor the whole command; ^/$ would match any single line of a
    # multi-line query and silently ignore the rest.
    file_path = ::Regexp.last_match(1).strip
    file_path = file_path.gsub(/\A["']|["']\z/, "") # Remove leading/trailing quotes
    download_file_to_local(bucket_name, file_path, sync_config.sync_id)
  else
    raise "Invalid command. Supported commands: #{LIST_FILES_CMD}, #{DOWNLOAD_FILE_CMD} <file_path>"
  end
end
187
+
188
# Lists the objects found under a folder of the bucket as record messages.
#
# @param bucket_name [String] bucket to list
# @param folder_path [String, nil] folder to list; a trailing "/" is added
#   when missing. An empty or nil path is sent as an empty prefix rather
#   than "/", which would only match keys that literally start with a slash.
# @return [Array] one multiwoven message per object, excluding the object
#   whose key equals the folder prefix itself
def list_files_in_folder(bucket_name, folder_path)
  prefix = folder_path.to_s
  prefix = "#{prefix}/" unless prefix.empty? || prefix.end_with?("/")
  bucket = @s3_resource.bucket(bucket_name)

  bucket.objects(prefix: prefix).reject { |object| object.key == prefix }.map do |object|
    RecordMessage.new(
      data: {
        file_name: File.basename(object.key),
        file_path: object.key,
        size: object.content_length,
        file_type: File.extname(object.key).sub(".", ""),
        # Only last_modified is read from the listing, so it is used for both dates.
        created_date: object.last_modified.to_s,
        modified_date: object.last_modified.to_s
      },
      emitted_at: Time.now.to_i
    ).to_multiwoven_message
  end
end
206
+
207
# Downloads one object from the bucket to the local filesystem.
#
# When ENV["FILE_DOWNLOAD_PATH"] is set the file is written to
# <FILE_DOWNLOAD_PATH>/syncs/<sync_id>/<basename>; otherwise a temporary file
# keeping the object's extension is used. The temporary file is NOT removed
# automatically, so the returned local_path stays valid for the caller.
#
# @param bucket_name [String] bucket holding the object
# @param file_path [String] key of the object to download
# @param sync_id [#to_s] sync identifier used to namespace the local file
# @return [Array] single-element array with the multiwoven message
#   describing the downloaded file (including :local_path)
# @raise [RuntimeError] "File not found: <file_path>" when S3 reports NoSuchKey
def download_file_to_local(bucket_name, file_path, sync_id)
  download_path = ENV["FILE_DOWNLOAD_PATH"]
  file = if download_path
           # Loaded here because the file's top-level requires are not visible from this block.
           require "fileutils"
           target_dir = File.join(download_path, "syncs", sync_id.to_s)
           # The SDK opens the target for writing, so the directory must already exist.
           FileUtils.mkdir_p(target_dir)
           File.join(target_dir, File.basename(file_path))
         else
           # Tempfile only honours [prefix, suffix]; Tempfile.create (without a
           # block) also has no finalizer that could unlink the file behind our back.
           tempfile = Tempfile.create(["s3_file_#{sync_id}_", File.extname(file_path)])
           tempfile.close
           tempfile.path
         end

  object = @s3_resource.bucket(bucket_name).object(file_path)
  object.get(response_target: file)

  [RecordMessage.new(
    data: {
      local_path: file,
      file_name: File.basename(file_path),
      file_path: file_path,
      size: object.content_length,
      file_type: File.extname(file_path).sub(".", ""),
      modified_date: object.last_modified.to_s,
      created_date: object.last_modified.to_s
    },
    emitted_at: Time.now.to_i
  ).to_multiwoven_message]
rescue Aws::S3::Errors::NoSuchKey
  # Do not leave an empty temporary file behind when the key does not exist.
  File.delete(file) if download_path.nil? && file && File.exist?(file)
  raise "File not found: #{file_path}"
end
140
233
  end
141
234
  end
142
235
  end
@@ -19,8 +19,7 @@
19
19
  "region",
20
20
  "bucket",
21
21
  "access_id",
22
- "secret_access",
23
- "file_type"
22
+ "secret_access"
24
23
  ]
25
24
  },
26
25
  "else": {
@@ -29,11 +28,21 @@
29
28
  "region",
30
29
  "bucket",
31
30
  "arn",
32
- "external_id",
33
- "file_type"
31
+ "external_id"
34
32
  ]
35
33
  },
36
34
  "properties": {
35
+ "data_type": {
36
+ "description": "Type of data in the files",
37
+ "type": "string",
38
+ "title": "Data Type",
39
+ "enum": [
40
+ "structured",
41
+ "unstructured"
42
+ ],
43
+ "default": "structured",
44
+ "order": 9
45
+ },
37
46
  "auth_type": {
38
47
  "title": "Authentication type",
39
48
  "type": "string",
@@ -87,7 +87,11 @@ module Multiwoven::Integrations::Source
87
87
 
88
88
  def query(conn, query)
89
89
  @sftp.download!(@remote_file_path, @tempfile.path)
90
- query = query.gsub(/FROM\s+\S+/i, "FROM read_csv_auto('#{@tempfile.path}')") if query.match?(/\bFROM\b/i)
90
+ if query.gsub(/FROM\s+\S+/i).count > 1
91
+ query = query.gsub("FROM #{@remote_file_path}", "FROM read_csv_auto('#{@tempfile.path}')")
92
+ elsif query.match?(/\bFROM\b/i)
93
+ query = query.gsub(/FROM\s+\S+/i, "FROM read_csv_auto('#{@tempfile.path}')")
94
+ end
91
95
  records = get_results(conn, query)
92
96
  records.map do |row|
93
97
  RecordMessage.new(data: row, emitted_at: Time.now.to_i).to_multiwoven_message
@@ -56,6 +56,7 @@ require_relative "integrations/core/http_helper"
56
56
  require_relative "integrations/core/http_client"
57
57
  require_relative "integrations/core/streaming_http_client"
58
58
  require_relative "integrations/core/query_builder"
59
+ require_relative "integrations/core/unstructured_source_connector"
59
60
 
60
61
  # Source
61
62
  require_relative "integrations/source/snowflake/client"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: multiwoven-integrations
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.22.1
4
+ version: 0.22.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Subin T P
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-04-08 00:00:00.000000000 Z
11
+ date: 2025-04-15 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -585,6 +585,7 @@ files:
585
585
  - lib/multiwoven/integrations/core/rate_limiter.rb
586
586
  - lib/multiwoven/integrations/core/source_connector.rb
587
587
  - lib/multiwoven/integrations/core/streaming_http_client.rb
588
+ - lib/multiwoven/integrations/core/unstructured_source_connector.rb
588
589
  - lib/multiwoven/integrations/core/utils.rb
589
590
  - lib/multiwoven/integrations/destination/airtable/client.rb
590
591
  - lib/multiwoven/integrations/destination/airtable/config/catalog.json