multiwoven-integrations 0.34.8 → 0.34.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 68782aec4005294c631d307cf3b4fbff160ca94db2321cce3dd92b7b4b11e0c4
4
- data.tar.gz: ee482c5b9c92c123e3c2901313e84b02ce2f3f027fd279baa4e9d826e2fd7f8c
3
+ metadata.gz: 5c72a9da0aeb56e15b4ba607cb2aaa835c0bd82fc35f21acbfa00fd31aa7d0f9
4
+ data.tar.gz: 66a6b214675cd30fb9e7480c17e507f743030240b76fad18f7e87b180ced54ad
5
5
  SHA512:
6
- metadata.gz: 8881241bd2eeab50a20881892784f7b27a2e601ecafbbc006981695d2e00b6475762b71a8810c3c672b6c77624bdb5b08e0e35dcc397d7fec25098ee566846e1
7
- data.tar.gz: 465090979a77c9ab8e70bf1c11771f284392d112d6316388ef78f67d44028628ff495daa195f8550b72edf9f3ccc7c359b36a6770879925e2602508c4eff6db1
6
+ metadata.gz: b1085ca81e3b5701580426bc7046b8d4b8c4d5460ca38f59ca5ff7ec6eb929fa2a60e302aefb2ccc17163e00aaab4d220c591c9a156689fb9391dc4cf9f65c1d
7
+ data.tar.gz: e10ccc0be176977c96a1fbf9c6cda6e0baab3ee9050b6401250f619ed2f72314663ff271e1c78cbdc6be73bd1eedf72891437732d44a5b3e97fcc7e20687bbdf
@@ -30,8 +30,11 @@ module Multiwoven
30
30
  source_defined_primary_key: [["element_id"]]
31
31
  }.freeze
32
32
 
33
- # Commands for unstructured data operations
33
+ # Data types
34
34
  UNSTRUCTURED = "unstructured"
35
+ SEMISTRUCTURED = "semistructured"
36
+
37
+ # Commands for unstructured & semi-structured data operations
35
38
  LIST_FILES_CMD = "list_files"
36
39
  DOWNLOAD_FILE_CMD = "download_file"
37
40
 
@@ -39,6 +42,10 @@ module Multiwoven
39
42
  connection_config["data_type"] == UNSTRUCTURED
40
43
  end
41
44
 
45
+ def semistructured_data?(connection_config)
46
+ connection_config["data_type"] == SEMISTRUCTURED
47
+ end
48
+
42
49
  def create_unstructured_stream
43
50
  Multiwoven::Integrations::Protocol::Stream.new(
44
51
  name: UNSTRUCTURED,
@@ -47,6 +54,15 @@ module Multiwoven
47
54
  **UNSTRUCTURED_STREAM_CONFIG
48
55
  )
49
56
  end
57
+
58
+ def create_semistructured_stream
59
+ Multiwoven::Integrations::Protocol::Stream.new(
60
+ name: SEMISTRUCTURED,
61
+ action: StreamAction["fetch"],
62
+ json_schema: UNSTRUCTURED_SCHEMA,
63
+ **UNSTRUCTURED_STREAM_CONFIG
64
+ )
65
+ end
50
66
  end
51
67
  end
52
68
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Multiwoven
4
4
  module Integrations
5
- VERSION = "0.34.8"
5
+ VERSION = "0.34.9"
6
6
 
7
7
  ENABLED_SOURCES = %w[
8
8
  Snowflake
@@ -4,25 +4,36 @@ module Multiwoven::Integrations::Source
4
4
  module GoogleDrive
5
5
  include Multiwoven::Integrations::Core
6
6
 
7
- FIELDS = "files(id, name, parents, mimeType), nextPageToken"
7
+ FIELDS = "files(id, name, parents, mimeType, fileExtension, size, createdTime, modifiedTime), nextPageToken"
8
8
  MAX_PER_PAGE = 1000
9
9
  MIMETYPE_GOOGLE_DRIVE_FOLDER = "mimeType = 'application/vnd.google-apps.folder'"
10
10
 
11
- class Client < SourceConnector
11
+ class Client < UnstructuredSourceConnector
12
12
  def check_connection(connection_config)
13
13
  connection_config = connection_config.with_indifferent_access
14
- client = create_connection(connection_config)
15
- build_query(client)
14
+
15
+ if unstructured_data?(connection_config) || semistructured_data?(connection_config)
16
+ create_drive_connection(connection_config)
17
+ else
18
+ create_connection(connection_config)
19
+ end
16
20
  success_status
17
- rescue StandardError => e
21
+ rescue StandardError, NotImplementedError => e
18
22
  failure_status(e)
19
23
  end
20
24
 
21
- def discover(_connection_config)
22
- catalog_json = read_json(CATALOG_SPEC_PATH)
23
- catalog = build_catalog(catalog_json)
25
+ def discover(connection_config)
26
+ connection_config = connection_config.with_indifferent_access
27
+ streams = if unstructured_data?(connection_config)
28
+ [create_unstructured_stream]
29
+ elsif semistructured_data?(connection_config)
30
+ [create_semistructured_stream]
31
+ else
32
+ raise NotImplementedError, "Discovery failed: Structured data is not supported yet"
33
+ end
34
+ catalog = Catalog.new(streams: streams)
24
35
  catalog.to_multiwoven_message
25
- rescue StandardError => e
36
+ rescue StandardError, NotImplementedError => e
26
37
  handle_exception(e, {
27
38
  context: "GOOGLE_DRIVE:DISCOVER:EXCEPTION",
28
39
  type: "error"
@@ -31,12 +42,11 @@ module Multiwoven::Integrations::Source
31
42
 
32
43
  def read(sync_config)
33
44
  connection_config = sync_config.source.connection_specification.with_indifferent_access
34
- client = create_connection(connection_config)
35
- query = sync_config.model.query
36
- query = batched_query(query, sync_config.limit, sync_config.offset) unless sync_config.limit.nil? && sync_config.offset.nil?
37
- records = query(client, query)
38
- process_files(client, records)
39
- rescue StandardError => e
45
+
46
+ return handle_unstructured_data(sync_config) if unstructured_data?(connection_config) || semistructured_data?(connection_config)
47
+
48
+ raise NotImplementedError, "Read failed: Structured data is not supported yet"
49
+ rescue StandardError, NotImplementedError => e
40
50
  handle_exception(e, {
41
51
  context: "GOOGLE_DRIVE:READ:EXCEPTION",
42
52
  type: "error",
@@ -47,126 +57,103 @@ module Multiwoven::Integrations::Source
47
57
 
48
58
  private
49
59
 
50
- def query(client, query)
51
- limit = 0
52
- offset = 0
53
- query = query.gsub("\n", " ").gsub(/\s+/, " ")
54
- limit = query.match(/LIMIT (\d+)/)[1].to_i if query.include? "LIMIT"
55
- offset = query.match(/OFFSET (\d+)/)[1].to_i if query.include? "OFFSET"
56
- query = query.match(/\((.*)\) AS/)[1] if query.include? "AS subquery"
57
- columns = select_columns(query)
58
-
59
- google_drive_query = build_query(client)
60
- files = get_files(client, google_drive_query, limit, offset)
61
- files.map do |file|
62
- RecordMessage.new(data: prepare_invoice(file, columns), emitted_at: Time.now.to_i).to_multiwoven_message
63
- end
60
+ def create_connection(connection_config)
61
+ raise NotImplementedError, "Connection failed: Structured data is not supported yet"
64
62
  end
65
63
 
66
- # Reads files from Google Drive and sends them to Amazon Textract for analysis
67
- def process_files(client, records)
68
- textract = create_aws_textract_connection
69
- results = []
70
- records.each do |record|
71
- invoice = record.record.data
72
- begin
73
- temp_file = Tempfile.new(invoice["file_name"])
74
- client.get_file(invoice["id"], download_dest: temp_file.path)
75
-
76
- reader = PDF::Reader.new(temp_file)
77
- page_count = reader.page_count
78
-
79
- analysis = if page_count > 1
80
- start_expense_analysis(invoice["file_name"], temp_file)
81
- else
82
- [textract.analyze_expense(document: { bytes: File.binread(temp_file.path) })]
83
- end
64
+ def create_drive_connection(connection_config)
65
+ credentials = connection_config[:credentials_json]
66
+ @google_drive = Google::Apis::DriveV3::DriveService.new
67
+ @google_drive.authorization = Google::Auth::ServiceAccountCredentials.make_creds(
68
+ json_key_io: StringIO.new(credentials.to_json),
69
+ scope: GOOGLE_SHEETS_SCOPE
70
+ )
71
+ end
84
72
 
85
- invoice = extract_invoice_data(invoice, analysis)
86
- rescue Aws::Textract::Errors::UnsupportedDocumentException => e
87
- invoice["exception"] = e.message if invoice.key?("exception")
88
- handle_exception(e, {
89
- context: "GOOGLE_DRIVE:READ:EXTRACT:EXCEPTION",
90
- type: "error"
91
- })
92
- rescue StandardError => e
93
- handle_exception(e, {
94
- context: "GOOGLE_DRIVE:READ:EXTRACT:EXCEPTION",
95
- type: "error"
96
- })
97
- end
98
- results.append(RecordMessage.new(data: invoice, emitted_at: Time.now.to_i).to_multiwoven_message)
73
+ def handle_unstructured_data(sync_config)
74
+ connection_config = sync_config.source.connection_specification.with_indifferent_access
75
+ folder_name = connection_config[:folder_name]
76
+ command = sync_config.model.query.strip
77
+ create_drive_connection(connection_config)
78
+
79
+ case command
80
+ when LIST_FILES_CMD
81
+ list_files_in_folder(folder_name)
82
+ when /^#{DOWNLOAD_FILE_CMD}\s+(.+)$/
83
+ file_name = ::Regexp.last_match(1).strip
84
+ file_name = file_name.gsub(/^["']|["']$/, "") # Remove leading/trailing quotes
85
+ download_file_to_local(file_name, sync_config.sync_id)
86
+ else
87
+ raise ArgumentError, "Invalid command. Supported commands: #{LIST_FILES_CMD}, #{DOWNLOAD_FILE_CMD} <file_path>"
99
88
  end
100
- results
101
89
  end
102
90
 
103
- def start_expense_analysis(file_name, temp_file)
104
- bucket_name = ENV["TEXTRACT_BUCKET_NAME"]
105
- s3_client = create_aws_s3_connection
106
- textract = create_aws_textract_connection
107
-
108
- s3_client.put_object(
109
- bucket: bucket_name,
110
- key: file_name,
111
- body: temp_file
112
- )
113
-
114
- resp = textract.start_expense_analysis(
115
- document_location: {
116
- s3_object: {
117
- bucket: bucket_name,
118
- name: file_name
119
- }
120
- }
121
- )
122
-
123
- job_id = resp.job_id
124
- all_pages = []
125
- next_token = nil
126
-
127
- loop do
128
- result = textract.get_expense_analysis(
129
- job_id: job_id,
130
- next_token: next_token
131
- )
132
-
133
- status = result.job_status
134
- if status == "SUCCEEDED"
135
- all_pages << result
136
- next_token = result.next_token
137
- break unless next_token
138
- elsif %w[FAILED PARTIAL_SUCCESS].include?(status)
139
- raise "Textract job ended with status: #{status}"
140
- else
141
- sleep 2 # still IN_PROGRESS; wait briefly and try again
142
- end
91
+ def list_files_in_folder(folder_name)
92
+ query = build_query(folder_name)
93
+ records = get_files(@google_drive, query, 10_000, 0)
94
+ records.map do |row|
95
+ RecordMessage.new(
96
+ data: {
97
+ element_id: row.id,
98
+ file_name: row.name,
99
+ file_path: row.name,
100
+ size: row.size,
101
+ file_type: row.file_extension,
102
+ created_date: row.created_time,
103
+ modified_date: row.modified_time,
104
+ text: ""
105
+ },
106
+ emitted_at: Time.now.to_i
107
+ ).to_multiwoven_message
143
108
  end
144
- all_pages
145
109
  end
146
110
 
147
- def build_query(client)
148
- query = "mimeType != 'application/vnd.google-apps.folder'"
149
-
150
- if @options[:folder]
151
- folder_query = "#{MIMETYPE_GOOGLE_DRIVE_FOLDER} and (name = '#{@options[:folder]}')"
152
- response = client.list_files(include_items_from_all_drives: true, supports_all_drives: true, q: folder_query, fields: FIELDS)
153
- raise "Specified folder does not exist" if response.files.empty?
111
+ def download_file_to_local(file_name, sync_id)
112
+ download_path = ENV["FILE_DOWNLOAD_PATH"]
113
+ file = if download_path
114
+ File.join(download_path, "syncs", sync_id, File.basename(file_name))
115
+ else
116
+ Tempfile.new(["google_drive_file_syncs_#{sync_id}", File.extname(file_name)]).path
117
+ end
118
+
119
+ # Escape single quotes to prevent query injection
120
+ escaped_name = file_name.gsub("'", "\\\\'")
121
+ query = "mimeType != 'application/vnd.google-apps.folder' and name = '#{escaped_name}'"
122
+
123
+ records = get_files(@google_drive, query, 1, 0)
124
+ raise StandardError, "File not found." if records.empty?
125
+
126
+ @google_drive.get_file(records.first.id, download_dest: file)
127
+
128
+ [RecordMessage.new(
129
+ data: {
130
+ element_id: records.first.id,
131
+ local_path: file,
132
+ file_name: file_name,
133
+ file_path: file_name,
134
+ size: records.first.size,
135
+ file_type: records.first.file_extension,
136
+ created_date: records.first.created_time,
137
+ modified_date: records.first.modified_time,
138
+ text: ""
139
+ },
140
+ emitted_at: Time.now.to_i
141
+ ).to_multiwoven_message]
142
+ rescue StandardError => e
143
+ raise StandardError, "Failed to download file #{file_name}: #{e.message}"
144
+ end
154
145
 
155
- parent_id = response.files.first.id
156
- parents_query = "'#{parent_id}' in parents"
157
- end
146
+ def build_query(folder_name)
147
+ raise ArgumentError, "Folder name is required" if folder_name.blank?
158
148
 
159
- if @options[:subfolders]
160
- subfolders_query = MIMETYPE_GOOGLE_DRIVE_FOLDER
161
- subfolders_query += "and #{parents_query}" if parents_query
162
- response = client.list_files(include_items_from_all_drives: true, supports_all_drives: true, q: subfolders_query, fields: FIELDS)
163
- subfolders_ids = response.files.map { |file| "'#{file.id}'" }
164
- parents_query = "(#{subfolders_ids.join(" in parents or ")} in parents)"
165
- end
149
+ # Escape single quotes to prevent query injection
150
+ escaped_folder = folder_name.gsub("'", "\\\\'")
151
+ folder_query = "#{MIMETYPE_GOOGLE_DRIVE_FOLDER} and (name = '#{escaped_folder}')"
152
+ response = @google_drive.list_files(include_items_from_all_drives: true, supports_all_drives: true, q: folder_query, fields: FIELDS)
153
+ raise ArgumentError, "Specified folder does not exist" if response.files.empty?
166
154
 
167
- query += " and mimeType = '#{@options[:file_type]}'" if @options[:file_type]
168
- query += " and #{parents_query}" if parents_query
169
- query
155
+ parent_id = response.files.first.id
156
+ "'#{parent_id}' in parents"
170
157
  end
171
158
 
172
159
  def get_files(client, query, limit, offset)
@@ -193,100 +180,6 @@ module Multiwoven::Integrations::Source
193
180
 
194
181
  result
195
182
  end
196
-
197
- def select_columns(query)
198
- columns = query.match(/SELECT (.*) FROM/)[1]
199
- all_columns = %w[line_items id file_name exception results] + TEXTRACT_SUMMARY_FIELDS.keys
200
- @options[:fields] = all_columns if @options[:fields].empty?
201
-
202
- return @options[:fields] if columns.include?("*")
203
-
204
- columns = columns.split(",").map(&:strip)
205
- raise "Column(s) #{(columns - all_columns).join(", ")} not valid." if (columns - all_columns).length.positive?
206
-
207
- columns & all_columns
208
- end
209
-
210
- def prepare_invoice(file, columns)
211
- invoice = {}
212
- columns.each { |column| invoice[column] = "" if TEXTRACT_SUMMARY_FIELDS.key?(column) }
213
- invoice["line_items"] = [] if columns.any?("line_items")
214
- invoice["id"] = file.id if columns.any?("id")
215
- invoice["file_name"] = file.name if columns.any?("file_name")
216
- invoice["exception"] = "" if columns.any?("exception")
217
- invoice["results"] = {} if columns.any?("results")
218
- invoice
219
- end
220
-
221
- def create_connection(connection_config)
222
- @options = connection_config[:options]
223
- credentials = connection_config[:credentials_json]
224
- client = Google::Apis::DriveV3::DriveService.new
225
- client.authorization = Google::Auth::ServiceAccountCredentials.make_creds(
226
- json_key_io: StringIO.new(credentials.to_json),
227
- scope: GOOGLE_SHEETS_SCOPE
228
- )
229
- client
230
- end
231
-
232
- # TODO: Refactor (extract) code for Amazon Textract
233
- def create_aws_credentials
234
- access_key_id = ENV["TEXTRACT_ACCESS_KEY_ID"]
235
- secret_access_key = ENV["TEXTRACT_SECRET_ACCESS_KEY"]
236
- Aws::Credentials.new(access_key_id, secret_access_key)
237
- end
238
-
239
- def create_aws_textract_connection
240
- region = ENV["TEXTRACT_REGION"]
241
- credentials = create_aws_credentials
242
- Aws::Textract::Client.new(region: region, credentials: credentials)
243
- end
244
-
245
- def create_aws_s3_connection
246
- region = ENV["TEXTRACT_REGION"]
247
- credentials = create_aws_credentials
248
- Aws::S3::Client.new(region: region, credentials: credentials)
249
- end
250
-
251
- def extract_invoice_data(invoice, results)
252
- invoice = extract_summary_fields(invoice, results)
253
- invoice = extract_line_items(invoice, results)
254
- invoice["results"] = results.to_json if invoice.key?("results")
255
- invoice.transform_keys(&:to_sym)
256
- end
257
-
258
- def extract_summary_fields(invoice, results)
259
- document = results[0].expense_documents[0]
260
- (invoice.keys & TEXTRACT_SUMMARY_FIELDS.keys).each do |key|
261
- invoice[key] = extract_field_value(document.summary_fields, TEXTRACT_SUMMARY_FIELDS[key])
262
- end
263
- invoice
264
- end
265
-
266
- def extract_line_items(invoice, results)
267
- if invoice.key?("line_items")
268
- results.each do |result|
269
- result.expense_documents.each do |expense_document|
270
- expense_document.line_item_groups.each do |line_item_group|
271
- line_item_group.line_items.each do |line_item|
272
- extracted_line_item = {}
273
- TEXTRACT_LINE_ITEMS_FIELDS.each_key do |key|
274
- extracted_line_item[key] = extract_field_value(line_item.line_item_expense_fields, TEXTRACT_LINE_ITEMS_FIELDS[key])
275
- end
276
- invoice["line_items"] << extracted_line_item
277
- end
278
- end
279
- end
280
- end
281
- end
282
- invoice["line_items"] = invoice["line_items"].to_json
283
- invoice
284
- end
285
-
286
- def extract_field_value(fields, selector)
287
- selected_field = fields.select { |field| field.type.text == selector }.first
288
- selected_field ? selected_field.value_detection.text : ""
289
- end
290
183
  end
291
184
  end
292
185
  end
@@ -8,6 +8,16 @@
8
8
  "type": "object",
9
9
  "required": ["credentials_json"],
10
10
  "properties": {
11
+ "data_type": {
12
+ "description": "Type of data in files",
13
+ "type": "string",
14
+ "title": "Data Format Type",
15
+ "oneOf": [
16
+ { "const": "structured", "title": "Tables & Records (Structured)" },
17
+ { "const": "unstructured", "title": "Documents & Files (Unstructured)" },
18
+ { "const": "semistructured", "title": "Invoices, Receipts, Tables, Forms (Semi-structured)" }
19
+ ]
20
+ },
11
21
  "credentials_json": {
12
22
  "type": "object",
13
23
  "description": "You can get the keys from the Google Cloud web console. First, go to the IAM page and select Service Accounts from the left menu. Next, locate your service account in the list, click on its Keys tab, and then click Add Key. Lastly, click Create new key and select JSON.",
@@ -67,55 +77,10 @@
67
77
  "universe_domain"
68
78
  ]
69
79
  },
70
- "options": {
71
- "type": "object",
72
- "description": "When the subfolders option is set to true, files will be read from the subfolders of the root or specified folder.",
73
- "title": "Options",
74
- "required": ["document_type"],
75
- "properties": {
76
- "folder": {
77
- "type": "string",
78
- "description": "When specified, reads files contained in the folder.",
79
- "title": "Folder"
80
- },
81
- "subfolders": {
82
- "type": "boolean",
83
- "default": false,
84
- "title": "Read from subfolders"
85
- },
86
- "file_type": {
87
- "description": "The type of file to read",
88
- "type": "string",
89
- "title": "File Type",
90
- "enum": [
91
- "application/pdf"
92
- ]
93
- },
94
- "document_type": {
95
- "type": "string",
96
- "enum": [ "invoices" ]
97
- },
98
- "fields": {
99
- "type": "array",
100
- "description": "Leave blank to extract all fields.",
101
- "items": {
102
- "type": "string",
103
- "anyOf": [
104
- { "const": "id", "title": "File Id" },
105
- { "const": "file_name", "title": "File Name" },
106
- { "const": "exception", "title": "Parsing Exception" },
107
- { "const": "invoice_number", "title": "Invoice/Receipt Number" },
108
- { "const": "invoice_date", "title": "Invoice/Receipt Date" },
109
- { "const": "invoice_total", "title": "Invoice/Receipt Total" },
110
- { "const": "purchase_order", "title": "Purchase Order Number" },
111
- { "const": "line_items", "title": "Invoice Line Items (Code, Description, Quantity, Unit Price, Total Price)"},
112
- { "const": "vendor_name", "title": "Vendor Name" },
113
- { "const": "results", "title": "Parsing Results" }
114
- ]
115
- },
116
- "uniqueItems": true
117
- }
118
- }
80
+ "folder_name": {
81
+ "description": "Name of folder to read files from",
82
+ "type": "string",
83
+ "title": "Folder Name"
119
84
  }
120
85
  }
121
86
  }
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: multiwoven-integrations
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.34.8
4
+ version: 0.34.9
5
5
  platform: ruby
6
6
  authors:
7
7
  - Subin T P