multiwoven-integrations 0.34.2 → 0.34.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 26de49e95f5e42ee120a8ff08aa2e6690fac256496870b643db9afea71ff221c
|
4
|
+
data.tar.gz: b1af60ffa953977ae791a67d07931726b03cf58d637be8b8ca3a826760a94bff
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0c159fcc1b9bb3b28a66a0c071a0adb2998e68134752cc865b0e3f70e953732590719cb0996991c96c3c8fa038b0c03870e88b396b65c385b3b6c4387f58ee2a
|
7
|
+
data.tar.gz: 23a1b467a73c242cf8b00a209e95e6623765e0ef54a9406bfe2ccff51a599268e63adc0d0a0b3774a921035ef9a6f7bb3450c683f628e4e680b469386363f327
|
@@ -35,7 +35,7 @@ module Multiwoven::Integrations::Source
|
|
35
35
|
query = sync_config.model.query
|
36
36
|
query = batched_query(query, sync_config.limit, sync_config.offset) unless sync_config.limit.nil? && sync_config.offset.nil?
|
37
37
|
records = query(client, query)
|
38
|
-
|
38
|
+
process_files(client, records)
|
39
39
|
rescue StandardError => e
|
40
40
|
handle_exception(e, {
|
41
41
|
context: "GOOGLE_DRIVE:READ:EXCEPTION",
|
@@ -64,16 +64,24 @@ module Multiwoven::Integrations::Source
|
|
64
64
|
end
|
65
65
|
|
66
66
|
# Reads files from Google Drive and sends them to Amazon Textract for analysis
|
67
|
-
def
|
68
|
-
textract =
|
67
|
+
def process_files(client, records)
|
68
|
+
textract = create_aws_textract_connection
|
69
69
|
results = []
|
70
70
|
records.each do |record|
|
71
71
|
invoice = record.record.data
|
72
72
|
begin
|
73
|
-
|
74
|
-
client.get_file(invoice["id"], download_dest:
|
75
|
-
|
76
|
-
|
73
|
+
temp_file = Tempfile.new(invoice["file_name"])
|
74
|
+
client.get_file(invoice["id"], download_dest: temp_file.path)
|
75
|
+
|
76
|
+
reader = PDF::Reader.new(temp_file)
|
77
|
+
page_count = reader.page_count
|
78
|
+
|
79
|
+
analysis = if page_count > 1
|
80
|
+
start_expense_analysis(invoice["file_name"], temp_file)
|
81
|
+
else
|
82
|
+
[textract.analyze_expense(document: { bytes: File.binread(temp_file.path) })]
|
83
|
+
end
|
84
|
+
|
77
85
|
invoice = extract_invoice_data(invoice, analysis)
|
78
86
|
rescue Aws::Textract::Errors::UnsupportedDocumentException => e
|
79
87
|
invoice["exception"] = e.message if invoice.key?("exception")
|
@@ -92,6 +100,50 @@ module Multiwoven::Integrations::Source
|
|
92
100
|
results
|
93
101
|
end
|
94
102
|
|
103
|
+
def start_expense_analysis(file_name, temp_file)
|
104
|
+
bucket_name = ENV["TEXTRACT_BUCKET_NAME"]
|
105
|
+
s3_client = create_aws_s3_connection
|
106
|
+
textract = create_aws_textract_connection
|
107
|
+
|
108
|
+
s3_client.put_object(
|
109
|
+
bucket: bucket_name,
|
110
|
+
key: file_name,
|
111
|
+
body: temp_file
|
112
|
+
)
|
113
|
+
|
114
|
+
resp = textract.start_expense_analysis(
|
115
|
+
document_location: {
|
116
|
+
s3_object: {
|
117
|
+
bucket: bucket_name,
|
118
|
+
name: file_name
|
119
|
+
}
|
120
|
+
}
|
121
|
+
)
|
122
|
+
|
123
|
+
job_id = resp.job_id
|
124
|
+
all_pages = []
|
125
|
+
next_token = nil
|
126
|
+
|
127
|
+
loop do
|
128
|
+
result = textract.get_expense_analysis(
|
129
|
+
job_id: job_id,
|
130
|
+
next_token: next_token
|
131
|
+
)
|
132
|
+
|
133
|
+
status = result.job_status
|
134
|
+
if status == "SUCCEEDED"
|
135
|
+
all_pages << result
|
136
|
+
next_token = result.next_token
|
137
|
+
break unless next_token
|
138
|
+
elsif %w[FAILED PARTIAL_SUCCESS].include?(status)
|
139
|
+
raise "Textract job ended with status: #{status}"
|
140
|
+
else
|
141
|
+
sleep 2 # still IN_PROGRESS; wait briefly and try again
|
142
|
+
end
|
143
|
+
end
|
144
|
+
all_pages
|
145
|
+
end
|
146
|
+
|
95
147
|
def build_query(client)
|
96
148
|
query = "mimeType != 'application/vnd.google-apps.folder'"
|
97
149
|
|
@@ -177,34 +229,58 @@ module Multiwoven::Integrations::Source
|
|
177
229
|
client
|
178
230
|
end
|
179
231
|
|
180
|
-
|
181
|
-
|
232
|
+
# TODO: Refactor (extract) code for Amazon Textract
|
233
|
+
def create_aws_credentials
|
182
234
|
access_key_id = ENV["AWS_ACCESS_KEY_ID"]
|
183
235
|
secret_access_key = ENV["AWS_SECRET_ACCESS_KEY"]
|
184
|
-
|
236
|
+
Aws::Credentials.new(access_key_id, secret_access_key)
|
237
|
+
end
|
238
|
+
|
239
|
+
def create_aws_textract_connection
|
240
|
+
region = ENV["AWS_REGION"]
|
241
|
+
credentials = create_aws_credentials
|
185
242
|
Aws::Textract::Client.new(region: region, credentials: credentials)
|
186
243
|
end
|
187
244
|
|
245
|
+
def create_aws_s3_connection
|
246
|
+
region = ENV["AWS_REGION"]
|
247
|
+
credentials = create_aws_credentials
|
248
|
+
Aws::S3::Client.new(region: region, credentials: credentials)
|
249
|
+
end
|
250
|
+
|
188
251
|
def extract_invoice_data(invoice, results)
|
189
|
-
|
252
|
+
invoice = extract_summary_fields(invoice, results)
|
253
|
+
invoice = extract_line_items(invoice, results)
|
254
|
+
invoice["results"] = results.to_json if invoice.key?("results")
|
255
|
+
invoice.transform_keys(&:to_sym)
|
256
|
+
end
|
257
|
+
|
258
|
+
def extract_summary_fields(invoice, results)
|
259
|
+
document = results[0].expense_documents[0]
|
190
260
|
(invoice.keys & TEXTRACT_SUMMARY_FIELDS.keys).each do |key|
|
191
|
-
invoice[key] = extract_field_value(
|
261
|
+
invoice[key] = extract_field_value(document.summary_fields, TEXTRACT_SUMMARY_FIELDS[key])
|
192
262
|
end
|
263
|
+
invoice
|
264
|
+
end
|
193
265
|
|
266
|
+
def extract_line_items(invoice, results)
|
194
267
|
if invoice.key?("line_items")
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
268
|
+
results.each do |result|
|
269
|
+
result.expense_documents.each do |expense_document|
|
270
|
+
expense_document.line_item_groups.each do |line_item_group|
|
271
|
+
line_item_group.line_items.each do |line_item|
|
272
|
+
extracted_line_item = {}
|
273
|
+
TEXTRACT_LINE_ITEMS_FIELDS.each_key do |key|
|
274
|
+
extracted_line_item[key] = extract_field_value(line_item.line_item_expense_fields, TEXTRACT_LINE_ITEMS_FIELDS[key])
|
275
|
+
end
|
276
|
+
invoice["line_items"] << extracted_line_item
|
277
|
+
end
|
200
278
|
end
|
201
|
-
invoice["line_items"] << extracted_line_item
|
202
279
|
end
|
203
280
|
end
|
204
281
|
end
|
205
282
|
invoice["line_items"] = invoice["line_items"].to_json
|
206
|
-
invoice
|
207
|
-
invoice.transform_keys(&:to_sym)
|
283
|
+
invoice
|
208
284
|
end
|
209
285
|
|
210
286
|
def extract_field_value(fields, selector)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: multiwoven-integrations
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.34.2
|
4
|
+
version: 0.34.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Subin T P
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-09-
|
11
|
+
date: 2025-09-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: activesupport
|