multiwoven-integrations 0.34.2 → 0.34.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e61c82f1e31623d9970165b02d0de53fd83e9a884e5d98d48ca4328d1ece23aa
4
- data.tar.gz: b37126ee6335f832709bc27697662dc3262fac19f78cfb5aa2d2f7b94937e10d
3
+ metadata.gz: 26de49e95f5e42ee120a8ff08aa2e6690fac256496870b643db9afea71ff221c
4
+ data.tar.gz: b1af60ffa953977ae791a67d07931726b03cf58d637be8b8ca3a826760a94bff
5
5
  SHA512:
6
- metadata.gz: 4f81b6d4c90067d74be610b31bdfdfbd9bdf8fb60a9e6dbe56e872b7dc3b2ecfcabb28f6b4ab98e5c5c092f8fcfa2aaccbc9154f842af8906d105b63f4467a20
7
- data.tar.gz: b2063c186173f056cebe9d51623a72c7c373bd59cc12f7065c7eb0e115d9332af4ee240f9481e4d19dd7ac7fead5edfd311c536dfadfc05a36f81daa4a9c6b8c
6
+ metadata.gz: 0c159fcc1b9bb3b28a66a0c071a0adb2998e68134752cc865b0e3f70e953732590719cb0996991c96c3c8fa038b0c03870e88b396b65c385b3b6c4387f58ee2a
7
+ data.tar.gz: 23a1b467a73c242cf8b00a209e95e6623765e0ef54a9406bfe2ccff51a599268e63adc0d0a0b3774a921035ef9a6f7bb3450c683f628e4e680b469386363f327
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Multiwoven
4
4
  module Integrations
5
- VERSION = "0.34.2"
5
+ VERSION = "0.34.3"
6
6
 
7
7
  ENABLED_SOURCES = %w[
8
8
  Snowflake
@@ -35,7 +35,7 @@ module Multiwoven::Integrations::Source
35
35
  query = sync_config.model.query
36
36
  query = batched_query(query, sync_config.limit, sync_config.offset) unless sync_config.limit.nil? && sync_config.offset.nil?
37
37
  records = query(client, query)
38
- analyze_expenses(client, records)
38
+ process_files(client, records)
39
39
  rescue StandardError => e
40
40
  handle_exception(e, {
41
41
  context: "GOOGLE_DRIVE:READ:EXCEPTION",
@@ -64,16 +64,24 @@ module Multiwoven::Integrations::Source
64
64
  end
65
65
 
66
66
  # Reads files from Google Drive and sends them to Amazon Textract for analysis
67
- def analyze_expenses(client, records)
68
- textract = create_aws_connection
67
+ def process_files(client, records)
68
+ textract = create_aws_textract_connection
69
69
  results = []
70
70
  records.each do |record|
71
71
  invoice = record.record.data
72
72
  begin
73
- byte_stream = StringIO.new
74
- client.get_file(invoice["id"], download_dest: byte_stream)
75
- byte_stream.rewind
76
- analysis = textract.analyze_expense(document: { bytes: byte_stream.read })
73
+ temp_file = Tempfile.new(invoice["file_name"])
74
+ client.get_file(invoice["id"], download_dest: temp_file.path)
75
+
76
+ reader = PDF::Reader.new(temp_file)
77
+ page_count = reader.page_count
78
+
79
+ analysis = if page_count > 1
80
+ start_expense_analysis(invoice["file_name"], temp_file)
81
+ else
82
+ [textract.analyze_expense(document: { bytes: File.binread(temp_file.path) })]
83
+ end
84
+
77
85
  invoice = extract_invoice_data(invoice, analysis)
78
86
  rescue Aws::Textract::Errors::UnsupportedDocumentException => e
79
87
  invoice["exception"] = e.message if invoice.key?("exception")
@@ -92,6 +100,50 @@ module Multiwoven::Integrations::Source
92
100
  results
93
101
  end
94
102
 
103
+ def start_expense_analysis(file_name, temp_file)
104
+ bucket_name = ENV["TEXTRACT_BUCKET_NAME"]
105
+ s3_client = create_aws_s3_connection
106
+ textract = create_aws_textract_connection
107
+
108
+ s3_client.put_object(
109
+ bucket: bucket_name,
110
+ key: file_name,
111
+ body: temp_file
112
+ )
113
+
114
+ resp = textract.start_expense_analysis(
115
+ document_location: {
116
+ s3_object: {
117
+ bucket: bucket_name,
118
+ name: file_name
119
+ }
120
+ }
121
+ )
122
+
123
+ job_id = resp.job_id
124
+ all_pages = []
125
+ next_token = nil
126
+
127
+ loop do
128
+ result = textract.get_expense_analysis(
129
+ job_id: job_id,
130
+ next_token: next_token
131
+ )
132
+
133
+ status = result.job_status
134
+ if status == "SUCCEEDED"
135
+ all_pages << result
136
+ next_token = result.next_token
137
+ break unless next_token
138
+ elsif %w[FAILED PARTIAL_SUCCESS].include?(status)
139
+ raise "Textract job ended with status: #{status}"
140
+ else
141
+ sleep 2 # still IN_PROGRESS; wait briefly and try again
142
+ end
143
+ end
144
+ all_pages
145
+ end
146
+
95
147
  def build_query(client)
96
148
  query = "mimeType != 'application/vnd.google-apps.folder'"
97
149
 
@@ -177,34 +229,58 @@ module Multiwoven::Integrations::Source
177
229
  client
178
230
  end
179
231
 
180
- def create_aws_connection
181
- region = ENV["AWS_REGION"]
232
+ # TODO: Refactor (extract) code for Amazon Textract
233
+ def create_aws_credentials
182
234
  access_key_id = ENV["AWS_ACCESS_KEY_ID"]
183
235
  secret_access_key = ENV["AWS_SECRET_ACCESS_KEY"]
184
- credentials = Aws::Credentials.new(access_key_id, secret_access_key)
236
+ Aws::Credentials.new(access_key_id, secret_access_key)
237
+ end
238
+
239
+ def create_aws_textract_connection
240
+ region = ENV["AWS_REGION"]
241
+ credentials = create_aws_credentials
185
242
  Aws::Textract::Client.new(region: region, credentials: credentials)
186
243
  end
187
244
 
245
+ def create_aws_s3_connection
246
+ region = ENV["AWS_REGION"]
247
+ credentials = create_aws_credentials
248
+ Aws::S3::Client.new(region: region, credentials: credentials)
249
+ end
250
+
188
251
  def extract_invoice_data(invoice, results)
189
- expense_document = results.expense_documents[0]
252
+ invoice = extract_summary_fields(invoice, results)
253
+ invoice = extract_line_items(invoice, results)
254
+ invoice["results"] = results.to_json if invoice.key?("results")
255
+ invoice.transform_keys(&:to_sym)
256
+ end
257
+
258
+ def extract_summary_fields(invoice, results)
259
+ document = results[0].expense_documents[0]
190
260
  (invoice.keys & TEXTRACT_SUMMARY_FIELDS.keys).each do |key|
191
- invoice[key] = extract_field_value(expense_document.summary_fields, TEXTRACT_SUMMARY_FIELDS[key])
261
+ invoice[key] = extract_field_value(document.summary_fields, TEXTRACT_SUMMARY_FIELDS[key])
192
262
  end
263
+ invoice
264
+ end
193
265
 
266
+ def extract_line_items(invoice, results)
194
267
  if invoice.key?("line_items")
195
- expense_document.line_item_groups.each do |line_item_group|
196
- line_item_group.line_items.each do |line_item|
197
- extracted_line_item = {}
198
- TEXTRACT_LINE_ITEMS_FIELDS.each_key do |key|
199
- extracted_line_item[key] = extract_field_value(line_item.line_item_expense_fields, TEXTRACT_LINE_ITEMS_FIELDS[key])
268
+ results.each do |result|
269
+ result.expense_documents.each do |expense_document|
270
+ expense_document.line_item_groups.each do |line_item_group|
271
+ line_item_group.line_items.each do |line_item|
272
+ extracted_line_item = {}
273
+ TEXTRACT_LINE_ITEMS_FIELDS.each_key do |key|
274
+ extracted_line_item[key] = extract_field_value(line_item.line_item_expense_fields, TEXTRACT_LINE_ITEMS_FIELDS[key])
275
+ end
276
+ invoice["line_items"] << extracted_line_item
277
+ end
200
278
  end
201
- invoice["line_items"] << extracted_line_item
202
279
  end
203
280
  end
204
281
  end
205
282
  invoice["line_items"] = invoice["line_items"].to_json
206
- invoice["results"] = results.to_json if invoice.key?("results")
207
- invoice.transform_keys(&:to_sym)
283
+ invoice
208
284
  end
209
285
 
210
286
  def extract_field_value(fields, selector)
@@ -46,6 +46,7 @@ require "googleauth"
46
46
  require "google/apis/drive_v3"
47
47
  require "aws-sdk-textract"
48
48
  require "jsonpath"
49
+ require "pdf/reader"
49
50
 
50
51
  # Service
51
52
  require_relative "integrations/config"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: multiwoven-integrations
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.34.2
4
+ version: 0.34.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Subin T P
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-09-20 00:00:00.000000000 Z
11
+ date: 2025-09-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport