multiwoven-integrations 0.32.3 → 0.33.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: c8252062105b7103cfc2624d70ddf31a190c09fefc7313ddb80932bb20f28a5f
4
- data.tar.gz: a48c355b81c0edf217b9ab4280aa0dd5621a7eb41a00024d702c031175ff7248
3
+ metadata.gz: 6be57be6129595ce97d90e8ae2458d9acf46f263d6385e19ab8d5e54f89bcc3a
4
+ data.tar.gz: 6054e513e6cd67226c5f35e9ab5f85dfe39ad3a13cab12ed2e1f6f33c08b68be
5
5
  SHA512:
6
- metadata.gz: d38f8aaeb8bcb6ed802fcca41945370e171c60f81dfa058688ac877aaf5b94ecd522b3f0f903563c6f56347d66d653fcce940297db5a93a90fa95e9efeb15900
7
- data.tar.gz: 975dd4a8cc2b424f8cd7098c06886c5550899130d8c8dd8ad46de10166f34063f492b8f5e687f452d1776ff888d7104d95cb1fb653dc17e00a9feb3adde921df
6
+ metadata.gz: aa601444cd95bd8a8b6404ff5c352c4d44a45a5a4d50af0a28a3bcb0a2a90163e1ee84ffb71939e575086821a54d88159fef1cbec260c7037ff423132b0840e7
7
+ data.tar.gz: b70732578f3dc563519db07e7ecb10df0022281e781099820a90f585791ca786c8eb23523fa93d2417acb1a51657e0f021b326822b5ca9845569ba57c288bef4
@@ -98,6 +98,22 @@ module Multiwoven
98
98
  FIRECRAWL_CRAWL_ACTIVE_URL = "https://api.firecrawl.dev/v1/crawl/active"
99
99
  FIRECRAWL_GET_CRAWL_URL = "https://api.firecrawl.dev/v1/crawl/%<id>s"
100
100
  FIRECRAWL_REQUEST_RATE_LIMIT = 5
101
+
102
+ # Amazon Textract (analyze_expense) field types, keyed by the invoice column each one populates
103
+ TEXTRACT_SUMMARY_FIELDS = {
104
+ "invoice_number" => "INVOICE_RECEIPT_ID",
105
+ "invoice_date" => "INVOICE_RECEIPT_DATE",
106
+ "purchase_order" => "PO_NUMBER",
107
+ "invoice_total" => "TOTAL",
108
+ "vendor_name" => "VENDOR_NAME"
109
+ }.freeze
110
+ TEXTRACT_LINE_ITEMS_FIELDS = {
111
+ "item_number" => "PRODUCT_CODE",
112
+ "item_description" => "ITEM",
113
+ "item_quantity" => "QUANTITY",
114
+ "item_price" => "UNIT_PRICE",
115
+ "line_total" => "PRICE"
116
+ }.freeze
101
117
  end
102
118
  end
103
119
  end
@@ -2,7 +2,7 @@
2
2
 
3
3
  module Multiwoven
4
4
  module Integrations
5
- VERSION = "0.32.3"
5
+ VERSION = "0.33.0"
6
6
 
7
7
  ENABLED_SOURCES = %w[
8
8
  Snowflake
@@ -32,6 +32,7 @@ module Multiwoven
32
32
  Qdrant
33
33
  Firecrawl
34
34
  Odoo
35
+ GoogleDrive
35
36
  ].freeze
36
37
 
37
38
  ENABLED_DESTINATIONS = %w[
@@ -0,0 +1,215 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Multiwoven::Integrations::Source
4
+ module GoogleDrive
5
+ include Multiwoven::Integrations::Core
6
+
7
+ FIELDS = "files(id, name, parents, mimeType), nextPageToken"
8
+ MAX_PER_PAGE = 1000
9
+ MIMETYPE_GOOGLE_DRIVE_FOLDER = "mimeType = 'application/vnd.google-apps.folder'"
10
+
11
+ class Client < SourceConnector
12
# Checks that a Drive client can be created from the given configuration and
# that the file-listing query can be assembled for it.
#
# @param connection_config [Hash] connector configuration
# @return the value of +success_status+, or of +failure_status+ when any
#   StandardError is raised along the way
def check_connection(connection_config)
  config = connection_config.with_indifferent_access
  drive = create_connection(config)
  build_query(drive)
  success_status
rescue StandardError => e
  failure_status(e)
end
20
+
21
# Builds the connector catalog from the JSON file at CATALOG_SPEC_PATH.
#
# @param _connection_config [Hash] unused
# @return the catalog converted with +to_multiwoven_message+, or the result
#   of +handle_exception+ when reading or building the catalog fails
def discover(_connection_config)
  build_catalog(read_json(CATALOG_SPEC_PATH)).to_multiwoven_message
rescue StandardError => e
  handle_exception(e, { context: "GOOGLE_DRIVE:DISCOVER:EXCEPTION", type: "error" })
end
31
+
32
# Lists the files selected by the sync's model query and runs each of them
# through expense analysis.
#
# The model query is wrapped with +batched_query+ whenever the sync supplies
# a limit or an offset.
#
# @param sync_config sync configuration exposing +source+, +model+, +limit+,
#   +offset+, +sync_id+ and +sync_run_id+
# @return the messages produced by +analyze_expenses+, or the result of
#   +handle_exception+ when any StandardError is raised
def read(sync_config)
  config = sync_config.source.connection_specification.with_indifferent_access
  drive = create_connection(config)
  sql = sync_config.model.query
  paginated = !(sync_config.limit.nil? && sync_config.offset.nil?)
  sql = batched_query(sql, sync_config.limit, sync_config.offset) if paginated
  analyze_expenses(drive, query(drive, sql))
rescue StandardError => e
  handle_exception(
    e,
    {
      context: "GOOGLE_DRIVE:READ:EXCEPTION",
      type: "error",
      sync_id: sync_config.sync_id,
      sync_run_id: sync_config.sync_run_id
    }
  )
end
47
+
48
+ private
49
+
50
# Translates the SQL-like model query into a Drive file listing and wraps
# every listed file in a record message.
#
# LIMIT and OFFSET are taken from the first "LIMIT <n>" / "OFFSET <n>" found
# in the statement and default to 0 when absent. Extraction is nil-safe, so
# a statement that merely contains the word (for example a column called
# CREDIT_LIMIT) no longer raises NoMethodError.
#
# @param client Drive client passed on to +build_query+ and +get_files+
# @param query [String] SQL-like statement, optionally wrapped as
#   "(...) AS subquery"
# @return [Array] one message per listed file
def query(client, query)
  # \s already matches newlines, so a single pass collapses all whitespace.
  statement = query.gsub(/\s+/, " ")
  limit = statement[/LIMIT (\d+)/, 1].to_i
  offset = statement[/OFFSET (\d+)/, 1].to_i
  # Unwrap the batching wrapper; keep the statement as-is if it has no parentheses.
  statement = statement[/\((.*)\) AS/, 1] || statement if statement.include?("AS subquery")
  columns = select_columns(statement)

  google_drive_query = build_query(client)
  get_files(client, google_drive_query, limit, offset).map do |file|
    RecordMessage.new(data: prepare_invoice(file, columns), emitted_at: Time.now.to_i).to_multiwoven_message
  end
end
65
+
66
# Downloads each listed file from Google Drive and sends it to Amazon
# Textract's analyze_expense, merging the extracted values into the record.
#
# A failure on one file does not abort the batch: the error is reported via
# +handle_exception+ and, when the record carries an "exception" column, the
# message is stored in it. The record is then emitted with whatever data it
# already held.
#
# @param client Drive client; must respond to +get_file(id, download_dest:)+
# @param records [Array] messages produced by +query+; each exposes
#   +record.data+ (the invoice hash)
# @return [Array] one message per input record, in input order
def analyze_expenses(client, records)
  textract = create_aws_connection
  records.map do |record|
    invoice = record.record.data
    begin
      byte_stream = StringIO.new
      client.get_file(invoice["id"], download_dest: byte_stream)
      byte_stream.rewind
      analysis = textract.analyze_expense(document: { bytes: byte_stream.read })
      invoice = extract_invoice_data(invoice, analysis)
    rescue Aws::Textract::Errors::UnsupportedDocumentException => e
      report_extraction_failure(invoice, e, "Document format not supported.")
    rescue StandardError => e
      report_extraction_failure(invoice, e, e.message)
    end
    RecordMessage.new(data: invoice, emitted_at: Time.now.to_i).to_multiwoven_message
  end
end

# Stores +message+ in the invoice's exception column (if it has one) and
# reports +error+ through +handle_exception+.
def report_extraction_failure(invoice, error, message)
  # prepare_invoice builds string keys, so look for "exception" first; the
  # symbol form is still honoured for records that arrive symbol-keyed.
  key = ["exception", :exception].find { |candidate| invoice.key?(candidate) }
  invoice[key] = message if key
  handle_exception(error, { context: "GOOGLE_DRIVE:READ:EXTRACT:EXCEPTION", type: "error" })
end
95
+
96
# Assembles the Drive search expression used to list files, based on the
# connector options (:folder, :subfolders, :file_type).
#
# Folders themselves are always excluded from the result. When :folder is
# set, the folder is looked up by name and only its children are listed.
# When :subfolders is set, files are listed from the subfolders of that
# folder (or of every visible location when no folder is given) instead.
#
# @param client Drive client; must respond to +list_files+
# @return [String] Drive search expression
# @raise [RuntimeError] when the named folder is not found, or when
#   :subfolders is set but no subfolder exists to read from
def build_query(client)
  options = @options || {}
  clauses = ["mimeType != 'application/vnd.google-apps.folder'"]
  parents_query = nil

  # A blank folder name is treated the same as no folder at all.
  folder = options[:folder].to_s
  unless folder.empty?
    folder_query = "#{MIMETYPE_GOOGLE_DRIVE_FOLDER} and (name = '#{escape_drive_literal(folder)}')"
    response = list_drive_items(client, folder_query)
    raise "Specified folder does not exist" if response.files.empty?

    parents_query = "'#{response.files.first.id}' in parents"
  end

  if options[:subfolders]
    subfolders_query = [MIMETYPE_GOOGLE_DRIVE_FOLDER, parents_query].compact.join(" and ")
    response = list_drive_items(client, subfolders_query)
    # Without this guard the expression would end in "( in parents)", which
    # is not a valid search expression.
    raise "No subfolders found to read from" if response.files.empty?

    parents_query = "(#{response.files.map { |file| "'#{file.id}' in parents" }.join(" or ")})"
  end

  clauses << "mimeType = '#{options[:file_type]}'" if options[:file_type]
  clauses << parents_query if parents_query
  clauses.join(" and ")
end

# Runs one Drive listing across all drives for the given search expression.
def list_drive_items(client, search)
  client.list_files(include_items_from_all_drives: true, supports_all_drives: true, q: search, fields: FIELDS)
end

# Backslash-escapes backslashes and single quotes so a user-supplied name
# cannot terminate the quoted literal inside a Drive search expression.
def escape_drive_literal(value)
  value.gsub(/[\\']/) { |char| "\\#{char}" }
end
120
+
121
# Lists up to +limit+ files matching +query+, continuing from the page token
# remembered from the previous call.
#
# The Drive listing is cursor based, so +offset+ is not applied as a numeric
# skip. It only tells whether this is the first batch (zero) or a
# continuation (positive):
# * an offset of zero drops any remembered token, so the listing starts from
#   the first page;
# * a positive offset with no remembered token means the previous batch
#   reached the end, so nothing is returned.
#
# @param client Drive client; must respond to +list_files+
# @param query [String] Drive search expression
# @param limit [Integer] maximum number of files to return (0 returns none)
# @param offset [Integer] batch offset requested by the caller
# @return [Array] the listed files
def get_files(client, query, limit, offset)
  @next_page_token = nil unless offset.positive?
  files = []
  return files if offset.positive? && !@next_page_token

  while files.size < limit
    response = client.list_files(
      include_items_from_all_drives: true,
      supports_all_drives: true,
      q: query,
      fields: FIELDS,
      page_size: [MAX_PER_PAGE, limit - files.size].min,
      page_token: @next_page_token
    )
    break if response.files.empty?

    files.concat(response.files)
    @next_page_token = response.next_page_token
    break unless @next_page_token
  end

  files
end
145
+
146
# Works out which invoice columns a "SELECT ... FROM" statement asks for.
#
# "SELECT *" yields the configured :fields option. When that option is
# missing or empty it is first set to every extractable column. An explicit
# column list is validated and returned without duplicates; "exception" is
# accepted there as well, since +prepare_invoice+ knows how to emit it.
#
# @param query [String] statement containing "SELECT <columns> FROM"
# @return [Array<String>] column names to emit
# @raise [RuntimeError] when the statement has no SELECT ... FROM clause or
#   names a column that does not exist
def select_columns(query)
  projection = query[/SELECT (.*) FROM/, 1]
  raise "Unable to read the selected columns from the query." if projection.nil?

  all_columns = %w[line_items id file_name] + TEXTRACT_SUMMARY_FIELDS.keys
  # :fields is optional, so both the options hash and the list may be absent.
  @options ||= {}
  @options[:fields] = all_columns if Array(@options[:fields]).empty?

  return @options[:fields] if projection.include?("*")

  valid_columns = all_columns + %w[exception]
  requested = projection.split(",").map(&:strip)
  unknown = requested - valid_columns
  raise "Column(s) #{unknown.join(", ")} not valid." unless unknown.empty?

  requested & valid_columns
end
158
+
159
# Builds the empty invoice record for one Drive file, containing only the
# requested columns.
#
# Summary columns and "exception" start as empty strings and "line_items" as
# an empty array. "id" and "file_name" are taken from the file itself.
#
# @param file Drive file entry; must respond to +id+ and +name+
# @param columns [Array<String>] column names to include
# @return [Hash{String => Object}] the invoice skeleton
def prepare_invoice(file, columns)
  invoice = {}
  columns.each { |column| invoice[column] = "" if TEXTRACT_SUMMARY_FIELDS.key?(column) }
  invoice["line_items"] = [] if columns.include?("line_items")
  invoice["id"] = file.id if columns.include?("id")
  # The file's display name, not its id.
  invoice["file_name"] = file.name if columns.include?("file_name")
  invoice["exception"] = "" if columns.include?("exception")
  invoice
end
168
+
169
# Creates a Drive service client authorised with the service-account key in
# the configuration, and remembers the connector options for later calls.
#
# The :options entry may be absent; it then defaults to an empty hash so
# later option lookups return nil instead of raising.
#
# @param connection_config [Hash] configuration with :credentials_json and,
#   optionally, :options
# @return [Google::Apis::DriveV3::DriveService] the authorised client
def create_connection(connection_config)
  @options = connection_config[:options] || {}
  credentials = connection_config[:credentials_json]
  client = Google::Apis::DriveV3::DriveService.new
  client.authorization = Google::Auth::ServiceAccountCredentials.make_creds(
    json_key_io: StringIO.new(credentials.to_json),
    scope: GOOGLE_SHEETS_SCOPE
  )
  client
end
179
+
180
# Creates the Amazon Textract client from the AWS_REGION, AWS_ACCESS_KEY_ID
# and AWS_SECRET_ACCESS_KEY environment variables.
#
# @return [Aws::Textract::Client]
def create_aws_connection
  Aws::Textract::Client.new(
    region: ENV["AWS_REGION"],
    credentials: Aws::Credentials.new(ENV["AWS_ACCESS_KEY_ID"], ENV["AWS_SECRET_ACCESS_KEY"])
  )
end
187
+
188
# Fills an invoice skeleton with the values found in a Textract
# analyze_expense result. Only the first expense document is read.
#
# Only columns already present in +invoice+ are populated. Line items are
# appended to the existing "line_items" array, which is then serialised to a
# JSON string. When "line_items" was not requested, no such key is added.
#
# @param invoice [Hash{String => Object}] skeleton from +prepare_invoice+;
#   updated in place
# @param results analyze_expense response exposing +expense_documents+
# @return [Hash{Symbol => Object}] copy of the invoice with symbol keys
def extract_invoice_data(invoice, results)
  expense_document = results.expense_documents[0]
  (invoice.keys & TEXTRACT_SUMMARY_FIELDS.keys).each do |key|
    invoice[key] = extract_field_value(expense_document.summary_fields, TEXTRACT_SUMMARY_FIELDS[key])
  end

  if invoice.key?("line_items")
    expense_document.line_item_groups.each do |line_item_group|
      line_item_group.line_items.each do |line_item|
        fields = line_item.line_item_expense_fields
        invoice["line_items"] << TEXTRACT_LINE_ITEMS_FIELDS.transform_values do |selector|
          extract_field_value(fields, selector)
        end
      end
    end
    # Serialise only when the column was requested; doing it unconditionally
    # would add a "line_items" => "null" entry to every other record.
    invoice["line_items"] = invoice["line_items"].to_json
  end

  invoice.transform_keys(&:to_sym)
end
208
+
209
# Returns the detected text of the first field whose type text equals
# +selector+.
#
# Fields without a type, without a value detection, or without detected text
# are tolerated and yield an empty string rather than raising.
#
# @param fields [Array, nil] fields exposing +type.text+ and
#   +value_detection.text+
# @param selector [String] field type to look for, e.g. "TOTAL"
# @return [String] the detected text, or "" when nothing usable is found
def extract_field_value(fields, selector)
  selected_field = (fields || []).find { |field| field.type&.text == selector }
  (selected_field&.value_detection&.text).to_s
end
213
+ end
214
+ end
215
+ end
@@ -0,0 +1,63 @@
1
+ {
2
+ "request_rate_limit": 6000,
3
+ "request_rate_limit_unit": "minute",
4
+ "request_rate_concurrency": 10,
5
+ "streams": [
6
+ {
7
+ "name": "invoices",
8
+ "action": "fetch",
9
+ "json_schema": {
10
+ "type": "object",
11
+ "properties": {
12
+ "id": {
13
+ "type": "string"
14
+ },
15
+ "vendor_name": {
16
+ "type": "string"
17
+ },
18
+ "file_name": {
19
+ "type": "string"
20
+ },
21
+ "exception": {
22
+ "type": "string"
23
+ },
24
+ "invoice_number": {
25
+ "type": "string"
26
+ },
27
+ "purchase_order": {
28
+ "type": "string"
29
+ },
30
+ "invoice_date": {
31
+ "type": "string"
32
+ },
33
+ "invoice_total": {
34
+ "type": "string"
35
+ },
36
+ "line_items": {
37
+ "type": "array",
38
+ "items": {
39
+ "type": "object",
40
+ "properties": {
41
+ "item_number": {
42
+ "type": "string"
43
+ },
44
+ "item_description": {
45
+ "type": "string"
46
+ },
47
+ "item_quantity": {
48
+ "type": "string"
49
+ },
50
+ "item_price": {
51
+ "type": "string"
52
+ },
53
+ "line_total": {
54
+ "type": "string"
55
+ }
56
+ }
57
+ }
58
+ }
59
+ }
60
+ }
61
+ }
62
+ ]
63
+ }
@@ -0,0 +1,15 @@
1
+ {
2
+ "data": {
3
+ "name": "GoogleDrive",
4
+ "title": "Google Drive",
5
+ "connector_type": "source",
6
+ "category": "File Storage",
7
+ "documentation_url": "https://docs.squared.ai/guides/sources/data-sources/google_drive",
8
+ "github_issue_label": "source-google-drive",
9
+ "icon": "icon.svg",
10
+ "license": "MIT",
11
+ "release_stage": "alpha",
12
+ "support_level": "community",
13
+ "tags": ["language:ruby", "multiwoven"]
14
+ }
15
+ }
@@ -0,0 +1,120 @@
1
+ {
2
+ "documentation_url": "https://docs.squared.ai/guides/sources/data-sources/google_drive",
3
+ "stream_type": "dynamic",
4
+ "connector_query_type": "raw_sql",
5
+ "connection_specification": {
6
+ "$schema": "http://json-schema.org/draft-07/schema#",
7
+ "title": "GoogleDrive",
8
+ "type": "object",
9
+ "required": ["credentials_json"],
10
+ "properties": {
11
+ "credentials_json": {
12
+ "type": "object",
13
+ "description": "You can get the keys from the Google Cloud web console. First, go to the IAM page and select Service Accounts from the left menu. Next, locate your service account in the list, click on its Keys tab, and then click Add Key. Lastly, click Create new key and select JSON.",
14
+ "title": "",
15
+ "properties": {
16
+ "type": {
17
+ "type": "string",
18
+ "enum": ["service_account"]
19
+ },
20
+ "project_id": {
21
+ "type": "string"
22
+ },
23
+ "private_key_id": {
24
+ "type": "string",
25
+ "multiwoven_secret": true
26
+ },
27
+ "private_key": {
28
+ "type": "string",
29
+ "multiwoven_secret": true
30
+ },
31
+ "client_email": {
32
+ "type": "string",
33
+ "format": "email"
34
+ },
35
+ "client_id": {
36
+ "type": "string",
37
+ "multiwoven_secret": true
38
+ },
39
+ "auth_uri": {
40
+ "type": "string"
41
+ },
42
+ "token_uri": {
43
+ "type": "string"
44
+ },
45
+ "auth_provider_x509_cert_url": {
46
+ "type": "string"
47
+ },
48
+ "client_x509_cert_url": {
49
+ "type": "string",
50
+ "format": "uri"
51
+ },
52
+ "universe_domain": {
53
+ "type": "string"
54
+ }
55
+ },
56
+ "required": [
57
+ "type",
58
+ "project_id",
59
+ "private_key_id",
60
+ "private_key",
61
+ "client_email",
62
+ "client_id",
63
+ "auth_uri",
64
+ "token_uri",
65
+ "auth_provider_x509_cert_url",
66
+ "client_x509_cert_url",
67
+ "universe_domain"
68
+ ]
69
+ },
70
+ "options": {
71
+ "type": "object",
72
+ "description": "When the subfolders option is set to true, files will be read from the subfolders of the root or specified folder.",
73
+ "title": "Options",
74
+ "required": ["document_type"],
75
+ "properties": {
76
+ "folder": {
77
+ "type": "string",
78
+ "description": "When specified, reads files contained in the folder.",
79
+ "title": "Folder"
80
+ },
81
+ "subfolders": {
82
+ "type": "boolean",
83
+ "default": false,
84
+ "title": "Read from subfolders"
85
+ },
86
+ "file_type": {
87
+ "description": "The type of file to read",
88
+ "type": "string",
89
+ "title": "File Type",
90
+ "enum": [
91
+ "application/pdf"
92
+ ]
93
+ },
94
+ "document_type": {
95
+ "type": "string",
96
+ "enum": [ "invoices" ]
97
+ },
98
+ "fields": {
99
+ "type": "array",
100
+ "description": "Leave blank to extract all fields.",
101
+ "items": {
102
+ "type": "string",
103
+ "anyOf": [
104
+ { "const": "id", "title": "File Id" },
105
+ { "const": "file_name", "title": "File Name" },
106
+ { "const": "exception", "title": "Parsing Exception" },
107
+ { "const": "invoice_number", "title": "Invoice/Receipt Number" },
108
+ { "const": "invoice_date", "title": "Invoice/Receipt Date" },
109
+ { "const": "invoice_total", "title": "Invoice/Receipt Total" },
110
+ { "const": "purchase_order", "title": "Purchase Order Number" },
111
+ { "const": "line_items", "title": "Invoice Line Items (Code, Description, Quantity, Unit Price, Total Price)"}
112
+ ]
113
+ },
114
+ "uniqueItems": true
115
+ }
116
+ }
117
+ }
118
+ }
119
+ }
120
+ }
@@ -0,0 +1,8 @@
1
+ <svg viewBox="0 0 87.3 78" xmlns="http://www.w3.org/2000/svg">
2
+ <path d="m6.6 66.85 3.85 6.65c.8 1.4 1.95 2.5 3.3 3.3l13.75-23.8h-27.5c0 1.55.4 3.1 1.2 4.5z" fill="#0066da"/>
3
+ <path d="m43.65 25-13.75-23.8c-1.35.8-2.5 1.9-3.3 3.3l-25.4 44a9.06 9.06 0 0 0 -1.2 4.5h27.5z" fill="#00ac47"/>
4
+ <path d="m73.55 76.8c1.35-.8 2.5-1.9 3.3-3.3l1.6-2.75 7.65-13.25c.8-1.4 1.2-2.95 1.2-4.5h-27.502l5.852 11.5z" fill="#ea4335"/>
5
+ <path d="m43.65 25 13.75-23.8c-1.35-.8-2.9-1.2-4.5-1.2h-18.5c-1.6 0-3.15.45-4.5 1.2z" fill="#00832d"/>
6
+ <path d="m59.8 53h-32.3l-13.75 23.8c1.35.8 2.9 1.2 4.5 1.2h50.8c1.6 0 3.15-.45 4.5-1.2z" fill="#2684fc"/>
7
+ <path d="m73.4 26.5-12.7-22c-.8-1.4-1.95-2.5-3.3-3.3l-13.75 23.8 16.15 28h27.45c0-1.55-.4-3.1-1.2-4.5z" fill="#ffba00"/>
8
+ </svg>
@@ -42,6 +42,9 @@ require "pinecone"
42
42
  require "intuit-oauth"
43
43
  require "nokogiri"
44
44
  require "xmlrpc/client"
45
+ require "googleauth"
46
+ require "google/apis/drive_v3"
47
+ require "aws-sdk-textract"
45
48
 
46
49
  # Service
47
50
  require_relative "integrations/config"
@@ -92,6 +95,7 @@ require_relative "integrations/source/pinecone_db/client"
92
95
  require_relative "integrations/source/qdrant/client"
93
96
  require_relative "integrations/source/firecrawl/client"
94
97
  require_relative "integrations/source/odoo/client"
98
+ require_relative "integrations/source/google_drive/client"
95
99
 
96
100
  # Destination
97
101
  require_relative "integrations/destination/klaviyo/client"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: multiwoven-integrations
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.32.3
4
+ version: 0.33.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Subin T P
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2025-07-28 00:00:00.000000000 Z
11
+ date: 2025-07-31 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -765,6 +765,11 @@ files:
765
765
  - lib/multiwoven/integrations/source/generic_open_ai/config/meta.json
766
766
  - lib/multiwoven/integrations/source/generic_open_ai/config/spec.json
767
767
  - lib/multiwoven/integrations/source/generic_open_ai/icon.svg
768
+ - lib/multiwoven/integrations/source/google_drive/client.rb
769
+ - lib/multiwoven/integrations/source/google_drive/config/catalog.json
770
+ - lib/multiwoven/integrations/source/google_drive/config/meta.json
771
+ - lib/multiwoven/integrations/source/google_drive/config/spec.json
772
+ - lib/multiwoven/integrations/source/google_drive/icon.svg
768
773
  - lib/multiwoven/integrations/source/google_vertex_model/client.rb
769
774
  - lib/multiwoven/integrations/source/google_vertex_model/config/catalog.json
770
775
  - lib/multiwoven/integrations/source/google_vertex_model/config/meta.json