multiwoven-integrations 0.34.8 → 0.34.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 68782aec4005294c631d307cf3b4fbff160ca94db2321cce3dd92b7b4b11e0c4
4
- data.tar.gz: ee482c5b9c92c123e3c2901313e84b02ce2f3f027fd279baa4e9d826e2fd7f8c
3
+ metadata.gz: a3146b56f87f49f572cc5c4ae740846e708137eb55881d87ba7982baa9f89768
4
+ data.tar.gz: 0474f983113e4e5b58af75c41c041a2d90f50d448872d23a6435a9711bbd7bca
5
5
  SHA512:
6
- metadata.gz: 8881241bd2eeab50a20881892784f7b27a2e601ecafbbc006981695d2e00b6475762b71a8810c3c672b6c77624bdb5b08e0e35dcc397d7fec25098ee566846e1
7
- data.tar.gz: 465090979a77c9ab8e70bf1c11771f284392d112d6316388ef78f67d44028628ff495daa195f8550b72edf9f3ccc7c359b36a6770879925e2602508c4eff6db1
6
+ metadata.gz: f62121314c055fa88538b0ecbb7c03837d83d6ed1d6a11571e0fece48066739590aab30b8f6d3b301f8cf07dc0c3a681b687b90d4744eb1d5603050291725165
7
+ data.tar.gz: 8bbc5c2e576eda8cfaa35241bd2b7edc2cc62eb05c0f7e6edd9947db5bef96946637b5ff5aec5b2c74961c87fc8f13858c7062529ad6194618e0991b2c7c075f
@@ -30,8 +30,11 @@ module Multiwoven
30
30
  source_defined_primary_key: [["element_id"]]
31
31
  }.freeze
32
32
 
33
- # Commands for unstructured data operations
33
+ # Data types
34
34
  UNSTRUCTURED = "unstructured"
35
+ SEMISTRUCTURED = "semistructured"
36
+
37
+ # Commands for unstructured & semi-structured data operations
35
38
  LIST_FILES_CMD = "list_files"
36
39
  DOWNLOAD_FILE_CMD = "download_file"
37
40
 
@@ -39,6 +42,10 @@ module Multiwoven
39
42
  connection_config["data_type"] == UNSTRUCTURED
40
43
  end
41
44
 
45
+ def semistructured_data?(connection_config)
46
+ connection_config["data_type"] == SEMISTRUCTURED
47
+ end
48
+
42
49
  def create_unstructured_stream
43
50
  Multiwoven::Integrations::Protocol::Stream.new(
44
51
  name: UNSTRUCTURED,
@@ -47,6 +54,15 @@ module Multiwoven
47
54
  **UNSTRUCTURED_STREAM_CONFIG
48
55
  )
49
56
  end
57
+
58
+ def create_semistructured_stream
59
+ Multiwoven::Integrations::Protocol::Stream.new(
60
+ name: SEMISTRUCTURED,
61
+ action: StreamAction["fetch"],
62
+ json_schema: UNSTRUCTURED_SCHEMA,
63
+ **UNSTRUCTURED_STREAM_CONFIG
64
+ )
65
+ end
50
66
  end
51
67
  end
52
68
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Multiwoven
4
4
  module Integrations
5
- VERSION = "0.34.8"
5
+ VERSION = "0.34.10"
6
6
 
7
7
  ENABLED_SOURCES = %w[
8
8
  Snowflake
@@ -4,25 +4,38 @@ module Multiwoven::Integrations::Source
4
4
  module GoogleDrive
5
5
  include Multiwoven::Integrations::Core
6
6
 
7
- FIELDS = "files(id, name, parents, mimeType), nextPageToken"
7
+ FIELDS = "files(id, name, parents, mimeType, fileExtension, size, createdTime, modifiedTime), nextPageToken"
8
8
  MAX_PER_PAGE = 1000
9
9
  MIMETYPE_GOOGLE_DRIVE_FOLDER = "mimeType = 'application/vnd.google-apps.folder'"
10
10
 
11
- class Client < SourceConnector
11
+ class Client < UnstructuredSourceConnector
12
12
  def check_connection(connection_config)
13
13
  connection_config = connection_config.with_indifferent_access
14
- client = create_connection(connection_config)
15
- build_query(client)
14
+
15
+ if unstructured_data?(connection_config) || semistructured_data?(connection_config)
16
+ create_drive_connection(connection_config)
17
+ folder_name = connection_config[:folder_name]
18
+ build_query(folder_name)
19
+ else
20
+ create_connection(connection_config)
21
+ end
16
22
  success_status
17
- rescue StandardError => e
23
+ rescue StandardError, NotImplementedError => e
18
24
  failure_status(e)
19
25
  end
20
26
 
21
- def discover(_connection_config)
22
- catalog_json = read_json(CATALOG_SPEC_PATH)
23
- catalog = build_catalog(catalog_json)
27
+ def discover(connection_config)
28
+ connection_config = connection_config.with_indifferent_access
29
+ streams = if unstructured_data?(connection_config)
30
+ [create_unstructured_stream]
31
+ elsif semistructured_data?(connection_config)
32
+ [create_semistructured_stream]
33
+ else
34
+ raise NotImplementedError, "Discovery failed: Structured data is not supported yet"
35
+ end
36
+ catalog = Catalog.new(streams: streams)
24
37
  catalog.to_multiwoven_message
25
- rescue StandardError => e
38
+ rescue StandardError, NotImplementedError => e
26
39
  handle_exception(e, {
27
40
  context: "GOOGLE_DRIVE:DISCOVER:EXCEPTION",
28
41
  type: "error"
@@ -31,12 +44,11 @@ module Multiwoven::Integrations::Source
31
44
 
32
45
  def read(sync_config)
33
46
  connection_config = sync_config.source.connection_specification.with_indifferent_access
34
- client = create_connection(connection_config)
35
- query = sync_config.model.query
36
- query = batched_query(query, sync_config.limit, sync_config.offset) unless sync_config.limit.nil? && sync_config.offset.nil?
37
- records = query(client, query)
38
- process_files(client, records)
39
- rescue StandardError => e
47
+
48
+ return handle_unstructured_data(sync_config) if unstructured_data?(connection_config) || semistructured_data?(connection_config)
49
+
50
+ raise NotImplementedError, "Read failed: Structured data is not supported yet"
51
+ rescue StandardError, NotImplementedError => e
40
52
  handle_exception(e, {
41
53
  context: "GOOGLE_DRIVE:READ:EXCEPTION",
42
54
  type: "error",
@@ -47,126 +59,104 @@ module Multiwoven::Integrations::Source
47
59
 
48
60
  private
49
61
 
50
- def query(client, query)
51
- limit = 0
52
- offset = 0
53
- query = query.gsub("\n", " ").gsub(/\s+/, " ")
54
- limit = query.match(/LIMIT (\d+)/)[1].to_i if query.include? "LIMIT"
55
- offset = query.match(/OFFSET (\d+)/)[1].to_i if query.include? "OFFSET"
56
- query = query.match(/\((.*)\) AS/)[1] if query.include? "AS subquery"
57
- columns = select_columns(query)
58
-
59
- google_drive_query = build_query(client)
60
- files = get_files(client, google_drive_query, limit, offset)
61
- files.map do |file|
62
- RecordMessage.new(data: prepare_invoice(file, columns), emitted_at: Time.now.to_i).to_multiwoven_message
63
- end
62
+ def create_connection(connection_config)
63
+ raise NotImplementedError, "Connection failed: Structured data is not supported yet"
64
64
  end
65
65
 
66
- # Reads files from Google Drive and sends them to Amazon Textract for analysis
67
- def process_files(client, records)
68
- textract = create_aws_textract_connection
69
- results = []
70
- records.each do |record|
71
- invoice = record.record.data
72
- begin
73
- temp_file = Tempfile.new(invoice["file_name"])
74
- client.get_file(invoice["id"], download_dest: temp_file.path)
75
-
76
- reader = PDF::Reader.new(temp_file)
77
- page_count = reader.page_count
78
-
79
- analysis = if page_count > 1
80
- start_expense_analysis(invoice["file_name"], temp_file)
81
- else
82
- [textract.analyze_expense(document: { bytes: File.binread(temp_file.path) })]
83
- end
66
+ def create_drive_connection(connection_config)
67
+ credentials = connection_config[:credentials_json]
68
+ @google_drive = Google::Apis::DriveV3::DriveService.new
69
+ @google_drive.authorization = Google::Auth::ServiceAccountCredentials.make_creds(
70
+ json_key_io: StringIO.new(credentials.to_json),
71
+ scope: GOOGLE_SHEETS_SCOPE
72
+ )
73
+ end
84
74
 
85
- invoice = extract_invoice_data(invoice, analysis)
86
- rescue Aws::Textract::Errors::UnsupportedDocumentException => e
87
- invoice["exception"] = e.message if invoice.key?("exception")
88
- handle_exception(e, {
89
- context: "GOOGLE_DRIVE:READ:EXTRACT:EXCEPTION",
90
- type: "error"
91
- })
92
- rescue StandardError => e
93
- handle_exception(e, {
94
- context: "GOOGLE_DRIVE:READ:EXTRACT:EXCEPTION",
95
- type: "error"
96
- })
97
- end
98
- results.append(RecordMessage.new(data: invoice, emitted_at: Time.now.to_i).to_multiwoven_message)
75
+ def handle_unstructured_data(sync_config)
76
+ connection_config = sync_config.source.connection_specification.with_indifferent_access
77
+ folder_name = connection_config[:folder_name]
78
+ command = sync_config.model.query.strip
79
+ create_drive_connection(connection_config)
80
+
81
+ case command
82
+ when LIST_FILES_CMD
83
+ list_files_in_folder(folder_name)
84
+ when /^#{DOWNLOAD_FILE_CMD}\s+(.+)$/
85
+ file_name = ::Regexp.last_match(1).strip
86
+ file_name = file_name.gsub(/^["']|["']$/, "") # Remove leading/trailing quotes
87
+ file_name = file_name.gsub("\\", "\\\\\\") # Escape backslashes
88
+ download_file_to_local(file_name, sync_config.sync_id)
89
+ else
90
+ raise ArgumentError, "Invalid command. Supported commands: #{LIST_FILES_CMD}, #{DOWNLOAD_FILE_CMD} <file_path>"
99
91
  end
100
- results
101
92
  end
102
93
 
103
- def start_expense_analysis(file_name, temp_file)
104
- bucket_name = ENV["TEXTRACT_BUCKET_NAME"]
105
- s3_client = create_aws_s3_connection
106
- textract = create_aws_textract_connection
107
-
108
- s3_client.put_object(
109
- bucket: bucket_name,
110
- key: file_name,
111
- body: temp_file
112
- )
113
-
114
- resp = textract.start_expense_analysis(
115
- document_location: {
116
- s3_object: {
117
- bucket: bucket_name,
118
- name: file_name
119
- }
120
- }
121
- )
122
-
123
- job_id = resp.job_id
124
- all_pages = []
125
- next_token = nil
126
-
127
- loop do
128
- result = textract.get_expense_analysis(
129
- job_id: job_id,
130
- next_token: next_token
131
- )
132
-
133
- status = result.job_status
134
- if status == "SUCCEEDED"
135
- all_pages << result
136
- next_token = result.next_token
137
- break unless next_token
138
- elsif %w[FAILED PARTIAL_SUCCESS].include?(status)
139
- raise "Textract job ended with status: #{status}"
140
- else
141
- sleep 2 # still IN_PROGRESS; wait briefly and try again
142
- end
94
+ def list_files_in_folder(folder_name)
95
+ query = build_query(folder_name)
96
+ records = get_files(@google_drive, query, 10_000, 0)
97
+ records.map do |row|
98
+ RecordMessage.new(
99
+ data: {
100
+ element_id: row.id,
101
+ file_name: row.name,
102
+ file_path: row.name,
103
+ size: row.size,
104
+ file_type: row.file_extension,
105
+ created_date: row.created_time,
106
+ modified_date: row.modified_time,
107
+ text: ""
108
+ },
109
+ emitted_at: Time.now.to_i
110
+ ).to_multiwoven_message
143
111
  end
144
- all_pages
145
112
  end
146
113
 
147
- def build_query(client)
148
- query = "mimeType != 'application/vnd.google-apps.folder'"
149
-
150
- if @options[:folder]
151
- folder_query = "#{MIMETYPE_GOOGLE_DRIVE_FOLDER} and (name = '#{@options[:folder]}')"
152
- response = client.list_files(include_items_from_all_drives: true, supports_all_drives: true, q: folder_query, fields: FIELDS)
153
- raise "Specified folder does not exist" if response.files.empty?
114
+ def download_file_to_local(file_name, sync_id)
115
+ download_path = ENV["FILE_DOWNLOAD_PATH"]
116
+ file = if download_path
117
+ File.join(download_path, "syncs", sync_id, File.basename(file_name))
118
+ else
119
+ Tempfile.new(["google_drive_file_syncs_#{sync_id}", File.extname(file_name)]).path
120
+ end
121
+
122
+ # Escape single quotes to prevent query injection
123
+ escaped_name = file_name.gsub("'", "\\\\'")
124
+ query = "mimeType != 'application/vnd.google-apps.folder' and name = '#{escaped_name}'"
125
+
126
+ records = get_files(@google_drive, query, 1, 0)
127
+ raise StandardError, "File not found." if records.empty?
128
+
129
+ @google_drive.get_file(records.first.id, download_dest: file)
130
+
131
+ [RecordMessage.new(
132
+ data: {
133
+ element_id: records.first.id,
134
+ local_path: file,
135
+ file_name: file_name,
136
+ file_path: file_name,
137
+ size: records.first.size,
138
+ file_type: records.first.file_extension,
139
+ created_date: records.first.created_time,
140
+ modified_date: records.first.modified_time,
141
+ text: ""
142
+ },
143
+ emitted_at: Time.now.to_i
144
+ ).to_multiwoven_message]
145
+ rescue StandardError => e
146
+ raise StandardError, "Failed to download file #{file_name}: #{e.message}"
147
+ end
154
148
 
155
- parent_id = response.files.first.id
156
- parents_query = "'#{parent_id}' in parents"
157
- end
149
+ def build_query(folder_name)
150
+ raise ArgumentError, "Folder name is required" if folder_name.blank?
158
151
 
159
- if @options[:subfolders]
160
- subfolders_query = MIMETYPE_GOOGLE_DRIVE_FOLDER
161
- subfolders_query += "and #{parents_query}" if parents_query
162
- response = client.list_files(include_items_from_all_drives: true, supports_all_drives: true, q: subfolders_query, fields: FIELDS)
163
- subfolders_ids = response.files.map { |file| "'#{file.id}'" }
164
- parents_query = "(#{subfolders_ids.join(" in parents or ")} in parents)"
165
- end
152
+ # Escape single quotes to prevent query injection
153
+ escaped_folder = folder_name.gsub("'", "\\\\'")
154
+ folder_query = "#{MIMETYPE_GOOGLE_DRIVE_FOLDER} and (name = '#{escaped_folder}')"
155
+ response = @google_drive.list_files(include_items_from_all_drives: true, supports_all_drives: true, q: folder_query, fields: FIELDS)
156
+ raise ArgumentError, "Specified folder does not exist" if response.files.empty?
166
157
 
167
- query += " and mimeType = '#{@options[:file_type]}'" if @options[:file_type]
168
- query += " and #{parents_query}" if parents_query
169
- query
158
+ parent_id = response.files.first.id
159
+ "'#{parent_id}' in parents"
170
160
  end
171
161
 
172
162
  def get_files(client, query, limit, offset)
@@ -193,100 +183,6 @@ module Multiwoven::Integrations::Source
193
183
 
194
184
  result
195
185
  end
196
-
197
- def select_columns(query)
198
- columns = query.match(/SELECT (.*) FROM/)[1]
199
- all_columns = %w[line_items id file_name exception results] + TEXTRACT_SUMMARY_FIELDS.keys
200
- @options[:fields] = all_columns if @options[:fields].empty?
201
-
202
- return @options[:fields] if columns.include?("*")
203
-
204
- columns = columns.split(",").map(&:strip)
205
- raise "Column(s) #{(columns - all_columns).join(", ")} not valid." if (columns - all_columns).length.positive?
206
-
207
- columns & all_columns
208
- end
209
-
210
- def prepare_invoice(file, columns)
211
- invoice = {}
212
- columns.each { |column| invoice[column] = "" if TEXTRACT_SUMMARY_FIELDS.key?(column) }
213
- invoice["line_items"] = [] if columns.any?("line_items")
214
- invoice["id"] = file.id if columns.any?("id")
215
- invoice["file_name"] = file.name if columns.any?("file_name")
216
- invoice["exception"] = "" if columns.any?("exception")
217
- invoice["results"] = {} if columns.any?("results")
218
- invoice
219
- end
220
-
221
- def create_connection(connection_config)
222
- @options = connection_config[:options]
223
- credentials = connection_config[:credentials_json]
224
- client = Google::Apis::DriveV3::DriveService.new
225
- client.authorization = Google::Auth::ServiceAccountCredentials.make_creds(
226
- json_key_io: StringIO.new(credentials.to_json),
227
- scope: GOOGLE_SHEETS_SCOPE
228
- )
229
- client
230
- end
231
-
232
- # TODO: Refactor (extract) code for Amazon Textract
233
- def create_aws_credentials
234
- access_key_id = ENV["TEXTRACT_ACCESS_KEY_ID"]
235
- secret_access_key = ENV["TEXTRACT_SECRET_ACCESS_KEY"]
236
- Aws::Credentials.new(access_key_id, secret_access_key)
237
- end
238
-
239
- def create_aws_textract_connection
240
- region = ENV["TEXTRACT_REGION"]
241
- credentials = create_aws_credentials
242
- Aws::Textract::Client.new(region: region, credentials: credentials)
243
- end
244
-
245
- def create_aws_s3_connection
246
- region = ENV["TEXTRACT_REGION"]
247
- credentials = create_aws_credentials
248
- Aws::S3::Client.new(region: region, credentials: credentials)
249
- end
250
-
251
- def extract_invoice_data(invoice, results)
252
- invoice = extract_summary_fields(invoice, results)
253
- invoice = extract_line_items(invoice, results)
254
- invoice["results"] = results.to_json if invoice.key?("results")
255
- invoice.transform_keys(&:to_sym)
256
- end
257
-
258
- def extract_summary_fields(invoice, results)
259
- document = results[0].expense_documents[0]
260
- (invoice.keys & TEXTRACT_SUMMARY_FIELDS.keys).each do |key|
261
- invoice[key] = extract_field_value(document.summary_fields, TEXTRACT_SUMMARY_FIELDS[key])
262
- end
263
- invoice
264
- end
265
-
266
- def extract_line_items(invoice, results)
267
- if invoice.key?("line_items")
268
- results.each do |result|
269
- result.expense_documents.each do |expense_document|
270
- expense_document.line_item_groups.each do |line_item_group|
271
- line_item_group.line_items.each do |line_item|
272
- extracted_line_item = {}
273
- TEXTRACT_LINE_ITEMS_FIELDS.each_key do |key|
274
- extracted_line_item[key] = extract_field_value(line_item.line_item_expense_fields, TEXTRACT_LINE_ITEMS_FIELDS[key])
275
- end
276
- invoice["line_items"] << extracted_line_item
277
- end
278
- end
279
- end
280
- end
281
- end
282
- invoice["line_items"] = invoice["line_items"].to_json
283
- invoice
284
- end
285
-
286
- def extract_field_value(fields, selector)
287
- selected_field = fields.select { |field| field.type.text == selector }.first
288
- selected_field ? selected_field.value_detection.text : ""
289
- end
290
186
  end
291
187
  end
292
188
  end
@@ -8,6 +8,16 @@
8
8
  "type": "object",
9
9
  "required": ["credentials_json"],
10
10
  "properties": {
11
+ "data_type": {
12
+ "description": "Type of data in files",
13
+ "type": "string",
14
+ "title": "Data Format Type",
15
+ "oneOf": [
16
+ { "const": "structured", "title": "Tables & Records (Structured)" },
17
+ { "const": "unstructured", "title": "Documents & Files (Unstructured)" },
18
+ { "const": "semistructured", "title": "Invoices, Receipts, Tables, Forms (Semi-structured)" }
19
+ ]
20
+ },
11
21
  "credentials_json": {
12
22
  "type": "object",
13
23
  "description": "You can get the keys from the Google Cloud web console. First, go to the IAM page and select Service Accounts from the left menu. Next, locate your service account in the list, click on its Keys tab, and then click Add Key. Lastly, click Create new key and select JSON.",
@@ -67,55 +77,10 @@
67
77
  "universe_domain"
68
78
  ]
69
79
  },
70
- "options": {
71
- "type": "object",
72
- "description": "When the subfolders option is set to true, files will be read from the subfolders of the root or specified folder.",
73
- "title": "Options",
74
- "required": ["document_type"],
75
- "properties": {
76
- "folder": {
77
- "type": "string",
78
- "description": "When specified, reads files contained in the folder.",
79
- "title": "Folder"
80
- },
81
- "subfolders": {
82
- "type": "boolean",
83
- "default": false,
84
- "title": "Read from subfolders"
85
- },
86
- "file_type": {
87
- "description": "The type of file to read",
88
- "type": "string",
89
- "title": "File Type",
90
- "enum": [
91
- "application/pdf"
92
- ]
93
- },
94
- "document_type": {
95
- "type": "string",
96
- "enum": [ "invoices" ]
97
- },
98
- "fields": {
99
- "type": "array",
100
- "description": "Leave blank to extract all fields.",
101
- "items": {
102
- "type": "string",
103
- "anyOf": [
104
- { "const": "id", "title": "File Id" },
105
- { "const": "file_name", "title": "File Name" },
106
- { "const": "exception", "title": "Parsing Exception" },
107
- { "const": "invoice_number", "title": "Invoice/Receipt Number" },
108
- { "const": "invoice_date", "title": "Invoice/Receipt Date" },
109
- { "const": "invoice_total", "title": "Invoice/Receipt Total" },
110
- { "const": "purchase_order", "title": "Purchase Order Number" },
111
- { "const": "line_items", "title": "Invoice Line Items (Code, Description, Quantity, Unit Price, Total Price)"},
112
- { "const": "vendor_name", "title": "Vendor Name" },
113
- { "const": "results", "title": "Parsing Results" }
114
- ]
115
- },
116
- "uniqueItems": true
117
- }
118
- }
80
+ "folder_name": {
81
+ "description": "Name of folder to read files from",
82
+ "type": "string",
83
+ "title": "Folder Name"
119
84
  }
120
85
  }
121
86
  }
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: multiwoven-integrations
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.34.8
4
+ version: 0.34.10
5
5
  platform: ruby
6
6
  authors:
7
7
  - Subin T P
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-11-18 00:00:00.000000000 Z
11
+ date: 2025-11-19 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport