connectors_sdk 8.3.0.0.pre.20220414T060419Z

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,153 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'connectors_sdk/base/adapter'
10
+
11
+ module ConnectorsSdk
12
+ module Office365
13
+ class Adapter < ConnectorsSdk::Base::Adapter
14
# Placeholder for converting a file item into a Swiftype document.
# This adapter does not provide an implementation.
#
# @param _file [Object] ignored
# @raise [NotImplementedError] always
def self.swiftype_document_from_file(_file)
  raise(NotImplementedError)
end
17
+
18
# Placeholder for converting a folder item into a Swiftype document.
# This adapter does not provide an implementation.
#
# @param _folder [Object] ignored
# @raise [NotImplementedError] always
def self.swiftype_document_from_folder(_folder)
  raise(NotImplementedError)
end
21
+
22
# Wraps one Microsoft Graph drive item and turns it into a Swiftype document hash.
# The base class leaves `type`, `fields` and `.convert_id_to_fp_id` unimplemented;
# the subclasses in this file provide them.
class GraphItem
  attr_reader :item

  # @param item [Object] the Graph item; read through accessors such as `id`, `name`, `webUrl`
  def initialize(item)
    @item = item
  end

  # Maps a Graph item id to the document id. Not implemented on the base class.
  #
  # @raise [NotImplementedError] always
  def self.convert_id_to_fp_id(_id)
    raise(NotImplementedError)
  end

  # Builds the item's path from its parent reference path and its own name.
  # Everything up to and including the last `root:` marker is dropped from the parent path.
  #
  # @param item [Object] responds to `parentReference` (may be nil) and `name`
  # @return [Object] whatever `Adapter.normalize_path` returns for "<parent>/<name>"
  def self.get_path(item)
    raw_parent_path = item.parentReference&.path || ''
    # A parent path ending in `root:` means the item sits directly in the drive root.
    folder_part = raw_parent_path.end_with?('root:') ? '' : CGI.unescape(raw_parent_path).split('root:').last
    ConnectorsSdk::Office365::Adapter.normalize_path("#{folder_part}/#{item.name}")
  end

  # @return [Hash] common document attributes merged with the subclass `fields` and then `permissions`
  def to_swiftype_document
    base_adapter = ConnectorsSdk::Base::Adapter
    document = {
      :_fields_to_preserve => ConnectorsSdk::Office365::Adapter.fields_to_preserve,
      :id => self.class.convert_id_to_fp_id(item.id),
      :path => get_path(item),
      :title => item.name,
      :url => item.webUrl,
      :type => base_adapter.normalize_enum(type),
      :created_by => created_by(item),
      :created_at => base_adapter.normalize_date(item.createdDateTime),
      :last_updated => base_adapter.normalize_date(item.lastModifiedDateTime),
      :updated_by => last_modified_by(item),
      :drive_owner => item.drive_owner_name
    }
    # Later merges win, so subclass fields (e.g. :title) override the defaults above.
    document.merge(fields).merge(permissions)
  end

  private

  def get_path(item)
    ConnectorsSdk::Office365::Adapter::GraphItem.get_path(item)
  end

  # @raise [NotImplementedError] always; subclasses return the document type
  def type
    raise(NotImplementedError)
  end

  # @raise [NotImplementedError] always; subclasses return type-specific attributes
  def fields
    raise(NotImplementedError)
  end

  def created_by(item)
    item.createdBy&.user&.displayName
  end

  def last_modified_by(item)
    item.lastModifiedBy&.user&.displayName
  end

  # @return [Hash] `{ ALLOW_FIELD => [ids and display names] }`, or `{}` when the item has no permissions
  def permissions
    return {} unless item.permissions.present?

    identities_per_permission = item.permissions.map do |next_permission|
      granted_user = [
        next_permission.dig(:grantedTo, :user, :id),
        next_permission.dig(:grantedTo, :user, :displayName)
      ]
      granted_user.compact
    end
    { ConnectorsShared::Constants::ALLOW_FIELD => identities_per_permission.flatten.uniq }
  end
end
97
+
98
# Graph item for a file: reports type 'file' and derives title, MIME type and
# extension from the item name.
class FileGraphItem < GraphItem
  # @raise [NotImplementedError] always
  def self.convert_id_to_fp_id(_id)
    raise(NotImplementedError)
  end

  private

  def type
    'file'
  end

  # @return [Hash] :title, :mime_type and :extension computed from `item.name`
  def fields
    # FIXME: potentially add `updated_by_email`
    file_name = item.name
    base_adapter = ConnectorsSdk::Base::Adapter
    {
      :title => base_adapter.strip_file_extension(file_name),
      :mime_type => base_adapter.mime_type_for_file(file_name),
      :extension => base_adapter.extension_for_file(file_name)
    }
  end
end
118
+
119
# Graph item for a folder: reports type 'folder' and titles the drive root
# with the drive name instead of the item name.
class FolderGraphItem < GraphItem
  private

  def type
    'folder'
  end

  # @return [Hash] :title — `item.drive_name` when `item.root` is truthy, otherwise `item.name`
  def fields
    folder_title = item.root ? item.drive_name : item.name
    { :title => folder_title }
  end
end
133
+
134
# Graph item for a package (e.g. a OneNote notebook): the document type is
# derived from the package type and no extra fields are added.
class PackageGraphItem < GraphItem
  # The argument is unused, so it is underscore-prefixed like in the sibling classes.
  #
  # @raise [NotImplementedError] always
  def self.convert_id_to_fp_id(_id)
    raise NotImplementedError
  end

  private

  # @return [String] the package type passed through `classify`
  #   (presumably ActiveSupport's String#classify — confirm)
  def type
    # MSFT gives packages as 'oneNote' and it should be called 'OneNote'
    item.package.type.classify
  end

  # @return [Hash] always empty; packages contribute no type-specific fields
  def fields
    {}
  end
end
151
+ end
152
+ end
153
+ end
@@ -0,0 +1,37 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'connectors_sdk/base/config'
10
+
11
+ module ConnectorsSdk
12
+ module Office365
13
# Office 365 connector configuration: the drives to index, whether permissions
# are indexed, and the sync cursors handled by the base config.
class Config < ConnectorsSdk::Base::Config
  # Sentinel `drive_ids` value meaning "index every drive".
  ALL_DRIVE_IDS = 'all'.freeze

  attr_reader :drive_ids, :index_permissions

  # @param drive_ids [Object] drive ids to index, or ALL_DRIVE_IDS
  # @param cursors [Hash] forwarded to the base config; gains a per-drive cursor bucket if missing
  # @param index_permissions [Boolean] whether permissions should be indexed
  def initialize(drive_ids:, cursors:, index_permissions: false)
    super(:cursors => cursors)
    drive_cursor_key = ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY
    @cursors[drive_cursor_key] ||= {}
    @drive_ids = drive_ids
    @index_permissions = index_permissions
  end

  # @return [Boolean] true when `drive_ids` equals ALL_DRIVE_IDS
  def index_all_drives?
    drive_ids == ALL_DRIVE_IDS
  end

  # @return [Hash] the base config hash extended with :drive_ids and :index_permissions
  def to_h
    base_hash = super
    base_hash.merge(:drive_ids => drive_ids, :index_permissions => index_permissions)
  end
end
36
+ end
37
+ end
@@ -0,0 +1,319 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'connectors_sdk/base/custom_client'
10
+ require 'connectors_shared'
11
+ require 'hashie/mash'
12
+
13
+ module ConnectorsSdk
14
+ module Office365
15
+ class CustomClient < ConnectorsSdk::Base::CustomClient
16
+
17
+ OFFICE365_PERMISSION_SYNC_TIME_SLA = 24.hours
18
+
19
# Error raised for unsuccessful Graph API responses; carries the HTTP status
# and the requested endpoint/URL.
class ClientError < ConnectorsShared::ClientError
  attr_reader :status_code, :endpoint

  # NOTE(review): does not call `super`; the message is supplied by `raise ..., message` at call sites.
  #
  # @param status_code [Object] HTTP status of the failed response
  # @param endpoint [Object] endpoint or URL that was requested
  def initialize(status_code, endpoint)
    @status_code, @endpoint = status_code, endpoint
  end
end
27
+
28
# Raised when the Graph API answers with error code `resyncRequired`.
class Office365InvalidCursorsError < ClientError
end
29
+
30
# This is necessary because `Faraday::NestedParamsEncoder.encode` changes the
# order of params, which Microsoft's download API can't handle for some reason.
module Office365DownloadParamsEncoder
  class << self
    extend Forwardable

    # Escaping and decoding are still handled by Faraday's encoder.
    def_delegators :'Faraday::NestedParamsEncoder', :escape, :decode

    # Encodes params as `key=value` pairs joined by `&`, keeping the iteration order of `params`.
    #
    # @param params [#map] key/value pairs
    # @return [String] the query string
    def encode(params)
      encoded_pairs = params.map { |key, value| "#{escape(key)}=#{escape(value)}" }
      encoded_pairs.join('&')
    end
  end
end
44
+
45
+ attr_reader :access_token
46
+ attr_accessor :cursors
47
+
48
+ BASE_URL = 'https://graph.microsoft.com/v1.0/'.freeze
49
+
50
# @param access_token [String] token sent as the Bearer credential
# @param cursors [Hash, nil] sync cursors; nil is replaced by a new hash, and the
#   per-drive cursor bucket is added to the given hash if it is missing
# @param ensure_fresh_auth [Object, nil] forwarded unchanged to the base client
def initialize(access_token:, cursors: {}, ensure_fresh_auth: nil)
  @access_token = access_token
  @cursors = cursors || {}
  drive_cursor_key = ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY
  @cursors[drive_cursor_key] ||= {}
  super(:ensure_fresh_auth => ensure_fresh_auth)
end
56
+
57
# Replaces the stored access token.
#
# @param new_access_token [String] token to use from now on
# @return [self] the receiver, to allow chaining
def update_auth_data!(new_access_token)
  tap { @access_token = new_access_token }
end
61
+
62
# Requests the `me` endpoint.
#
# @return [Object] the parsed response of `request_endpoint`
def me
  profile_endpoint = 'me'
  request_endpoint(:endpoint => profile_endpoint)
end
65
+
66
# Lists the drives under `me/drives/`.
# NOTE(review): only the first response page is read; `@odata.nextLink` is not followed here.
#
# @param fields [Array<String>] fields to `$select`; empty means no `$select`
# @return [Object] the `value` of the response
def one_drive_drives(fields: [])
  select_params = transform_fields_to_request_query_params(fields)
  request_endpoint(:endpoint => 'me/drives/', :query_params => select_params).value
end
71
+
72
# Collects the document-library drives of all SharePoint sites visible to the client.
#
# @param fields [Array<String>] fields to `$select` on each drive; empty means no `$select`
# @return [Array] drives of every distinct site; sites whose drive listing failed
#   (`site_drives` returned nil) contribute nothing
def share_point_drives(fields: [])
  # When new Private Team site is created in SharePoint, permissions take some time to propagate, therefore
  # this site won't be indexed by us until propagation happens. This code tries to also fetch sites from
  # recently created groups (new Private Team site will be there) to reduce friction and index this site
  # earlier.
  # See: https://github.com/elastic/ent-search/pull/3581
  #
  # `%w[id]` (an Array) is required here; the previous `%[id]` was a plain String literal.
  share_point_sites = sites(:fields => %w[id]) + recent_share_point_group_sites(:fields => %w[id])

  share_point_sites
    .map(&:id)
    .uniq
    .flat_map { |site_id| site_drives(site_id, :fields => fields) }
    .compact
end
87
+
88
# Fetches every entry of the `groups/` endpoint, following pagination via `request_all`.
#
# @param fields [Array<String>] fields to `$select`; empty means no `$select`
# @return [Array] all groups
def groups(fields: [])
  groups_endpoint = 'groups/'
  request_all(:endpoint => groups_endpoint, :fields => fields)
end
91
+
92
# Requests the root site of a group.
#
# @param group_id [String] id of the group
# @param fields [Array<String>] fields to `$select`; empty means no `$select`
# @return [Object] the parsed response of `request_endpoint`
def group_root_site(group_id, fields: [])
  select_params = transform_fields_to_request_query_params(fields)
  root_site_endpoint = "groups/#{group_id}/sites/root"
  request_endpoint(:endpoint => root_site_endpoint, :query_params => select_params)
end
97
+
98
# Fetches all sites through the `sites/` search endpoint.
#
# @param fields [Array<String>] fields to `$select`; empty means no `$select`
# @return [Array] all sites
def sites(fields: [])
  # This empty search string ends up returning all sites. If we leave it off, the API returns a 400.
  # The page size is set explicitly here (via :top) because otherwise the API just returns the first ten and
  # does not provide any additional pages.
  paging_params = { :search => '', :top => 10 }
  request_all(:endpoint => 'sites/', :fields => fields, :additional_query_params => paging_params)
end
104
+
105
# Lists the document-library drives of one site.
#
# @param site_id [String] id of the site
# @param fields [Array<String>] fields to `$select`; empty means no `$select`
# @return [Array, nil] the site's document libraries, or nil when the request raised ClientError
#   (the failure is logged at info level and swallowed)
def site_drives(site_id, fields: [])
  all_drives = request_all(:endpoint => "sites/#{site_id}/drives/", :fields => fields)
  document_libraries(all_drives)
rescue ClientError => e
  ConnectorsShared::Logger.info("Received response of #{e.status_code} trying to get drive for Site with Id = #{site_id}: #{e.message}")
  nil
end
113
+
114
# Walks a drive depth-first and yields every item found.
#
# With `break_after_page`, the walk pauses once at least 100 items have been yielded and
# folders remain; the remaining folder ids are stored in `cursors['page_cursor']` and are
# picked up again by the next call.
#
# @param drive_id [String] id of the drive to traverse
# @param fields [Array<String>] fields to `$select`; empty means no `$select`
# @param break_after_page [Boolean] whether to pause and resume through `cursors['page_cursor']`
# @yieldparam item [Object] each child item, folders included
# @return [nil]
def list_items(drive_id, fields: [], break_after_page: false)
  # MSFT Graph API does not have a recursive list items, have to do this dfs style
  stack =
    if break_after_page && cursors['page_cursor'].present?
      cursors.delete('page_cursor')
    else
      [get_root_item(drive_id, ['id']).id]
    end

  # The DFS reads `id` (to descend) and `folder` (to recognise folders), so both must be
  # selected whenever the caller restricts the field list; otherwise recursion silently stops.
  traversal_fields = fields.any? ? fields | %w[id folder] : fields
  yielded = 0
  while stack.any?
    folder_id = stack.pop
    item_children(drive_id, folder_id, :fields => traversal_fields) do |item|
      stack << item.id if item.folder
      yield item

      yielded += 1
    end

    if break_after_page && yielded >= 100 && stack.any?
      # Copy the stack so later mutation of the local array cannot affect the stored cursor.
      cursors['page_cursor'] = stack.dup
      break
    end
  end
end
143
+
144
# Requests the permissions of one drive item.
#
# @param drive_id [String] id of the drive
# @param item_id [String] id of the item
# @return [Object] the `value` of the response
def item_permissions(drive_id, item_id)
  permissions_endpoint = "drives/#{drive_id}/items/#{item_id}/permissions"
  response = request_endpoint(:endpoint => permissions_endpoint)
  response.value
end
147
+
148
# Iterates over the delta (change) feed of a drive and yields each relevant change.
#
# The first response comes from the stored `page_cursor` (only when `break_after_page` is
# set and a cursor exists), else from a fresh delta query when `start_delta_link` is nil,
# else from `start_delta_link`. With `break_after_page`, iteration pauses once at least
# 100 changes have been yielded and another page exists; that page's link is stored in
# `cursors['page_cursor']`.
#
# After the loop, the last response's `@odata.deltaLink` is stored as the drive's cursor.
# NOTE(review): when the loop pauses early that response may carry no delta link, in which
# case nil is stored — confirm this is intended.
#
# @param drive_id [String] id of the drive
# @param start_delta_link [String, nil] delta link to resume from
# @param last_modified [Time, nil] changes last modified before this time are skipped
# @param break_after_page [Boolean] whether to pause and resume through `cursors['page_cursor']`
# @yieldparam change [Object] each change that passes the filters
# @return [Object] the stored delta link (result of the final assignment)
def list_changes(drive_id:, start_delta_link: nil, last_modified: nil, break_after_page: false)
  selected_fields = %w(id content.downloadUrl lastModifiedDateTime lastModifiedBy root deleted file folder package name webUrl createdBy createdDateTime size).join(',')
  query_params = { :'$select' => selected_fields }
  response =
    if break_after_page && cursors['page_cursor'].present?
      request_json(:url => cursors.delete('page_cursor'))
    elsif start_delta_link.nil?
      request_endpoint(:endpoint => "drives/#{drive_id}/root/delta", :query_params => query_params)
    else
      request_json(:url => start_delta_link, :query_params => query_params)
    end

  yielded = 0
  loop do
    response.value.each do |change|
      # MSFT Graph API does not allow us to view "changes" in chronological order, so if there is no cursor,
      # we have to iterate through all changes and cherry-pick the ones that are past the `last_modified` Time
      # since to get another cursor, we would have to go through all the changes anyway
      next if last_modified.present? && Time.parse(change.lastModifiedDateTime) < last_modified
      next if change.root # We don't want to index the root of the drive

      yield change
      yielded += 1
    end

    next_link = response['@odata.nextLink']
    if break_after_page && yielded >= 100 && next_link.present?
      cursors['page_cursor'] = next_link
      break
    end

    break if next_link.nil?

    response = request_json(:url => next_link)
  end

  cursors[ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY][drive_id] = response['@odata.deltaLink']
end
184
+
185
# Returns the stored delta link for a drive, falling back to paging through the
# delta feed when none is stored.
#
# @param drive_id [String] id of the drive
# @return [String] the delta link
def get_latest_delta_link(drive_id)
  drive_cursors = cursors[ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY]
  stored_link = drive_cursors[drive_id]
  stored_link || exhaustively_get_delta_link(drive_id)
end
188
+
189
# Pages through the whole delta feed of a drive (selecting only `id`) to reach the final
# page, then returns that page's delta link with everything from the first `?` removed.
# The work is wrapped in a `Connectors::Stats.measure` block.
#
# @param drive_id [String] id of the drive
# @return [String] the delta link without its query string
def exhaustively_get_delta_link(drive_id)
  delta_endpoint = "drives/#{drive_id}/root/delta"

  Connectors::Stats.measure('custom_client.office365.exhaustively_get_delta_link') do
    page = request_endpoint(:endpoint => delta_endpoint, :query_params => { :'$select' => 'id' })

    loop do
      next_link = page['@odata.nextLink']
      break unless next_link

      page = request_json(:url => next_link)
    end

    page['@odata.deltaLink'].split('?').first
  end
end
202
+
203
# Downloads the content behind a download URL, using the order-preserving
# params encoder for the request.
#
# @param download_url [String] URL to fetch
# @return [Object] the response body
def download_item(download_url)
  response = request(:url => download_url) do |outgoing_request|
    outgoing_request.options.params_encoder = Office365DownloadParamsEncoder
  end
  response.body
end
208
+
209
# Collects the groups a user belongs to: everything from `transitiveMemberOf` plus
# the owned objects whose `@odata.type` is a group, without duplicates.
#
# @param user_id [String] id of the user
# @param fields [Array<String>] fields to `$select`; empty means no `$select`
# @return [Array] distinct membership and owned-group entries
def user_groups(user_id, fields = [])
  memberships = request_all(:endpoint => "users/#{user_id}/transitiveMemberOf", :fields => fields)
  owned_objects = request_all(:endpoint => "users/#{user_id}/ownedObjects", :fields => fields)
  owned_groups = owned_objects.select { |next_object| next_object['@odata.type'] == '#microsoft.graph.group' }
  (memberships + owned_groups).uniq
end
221
+
222
+ private
223
+
224
# Returns the root sites of groups created within the last OFFICE365_PERMISSION_SYNC_TIME_SLA.
#
# @param fields [Array<String>] accepted for signature parity with the other listing
#   methods; currently not used — only `id` is requested for each root site
# @return [Array] root sites of the recently created groups, nils removed
def recent_share_point_group_sites(fields: [])
  # group.createdDateTime field is UTC as stated in documentation:
  # https://docs.microsoft.com/en-us/graph/api/resources/group?view=graph-rest-1.0#properties
  created_date_time_threshold = Time.now.utc - OFFICE365_PERMISSION_SYNC_TIME_SLA

  recent_groups = groups(:fields => %w(id createdDateTime)).select do |group|
    created_at = group.createdDateTime
    # Groups without a creation timestamp cannot be classified as recent.
    next false if created_at.nil?

    # The API delivers the timestamp as a string; comparing a String with a Time raises
    # ArgumentError, so parse it first (as list_changes does for lastModifiedDateTime).
    created_at = Time.parse(created_at) if created_at.is_a?(String)
    created_at > created_date_time_threshold
  end

  recent_groups.map { |group| group_root_site(group.id, :fields => %w[id]) }.compact
end
233
+
234
# Keeps only the drives whose `driveType` is 'documentLibrary'.
#
# @param drives [Array] drives responding to `driveType`
# @return [Array] the matching drives, in their original order
def document_libraries(drives)
  drives.select do |drive|
    drive.driveType == 'documentLibrary'
  end
end
237
+
238
# Converts a list of field names into `$select` query params.
#
# @param fields [Array<String>] field names
# @return [Hash] `{}` for an empty list, otherwise `{ :'$select' => 'a,b,...' }`
def transform_fields_to_request_query_params(fields = [])
  return {} if fields.empty?

  { :'$select' => fields.join(',') }
end
241
+
242
# Requests an endpoint and follows `@odata.nextLink` until no further page is advertised,
# accumulating every page's `value`.
#
# @param endpoint [String] endpoint relative to BASE_URL
# @param fields [Array<String>] fields to `$select`; empty means no `$select`
# @param additional_query_params [Hash] extra query params merged over the `$select` params
# @return [Array] items of all pages (the first page's `value` array, extended in place)
def request_all(endpoint:, fields: [], additional_query_params: {})
  select_params = transform_fields_to_request_query_params(fields)
  page = request_endpoint(:endpoint => endpoint, :query_params => select_params.merge(additional_query_params))

  collected = page.value
  loop do
    next_link = page['@odata.nextLink']
    break unless next_link

    page = request_json(:url => next_link)
    collected.concat(page.value)
  end
  collected
end
253
+
254
# Requests the root item of a drive.
#
# @param drive_id [String] id of the drive
# @param fields [Array<String>] fields to `$select`; empty means no `$select`
# @return [Object] the parsed response of `request_endpoint`
def get_root_item(drive_id, fields = [])
  select_params = transform_fields_to_request_query_params(fields)
  root_endpoint = "drives/#{drive_id}/root"
  request_endpoint(:endpoint => root_endpoint, :query_params => select_params)
end
258
+
259
# Yields every child of a drive item, following `@odata.nextLink` across pages.
#
# @param drive_id [String] id of the drive
# @param item_id [String] id of the parent item
# @param fields [Array<String>] fields to `$select`; empty means no `$select`
# @yieldparam child [Object] each child item
# @return [nil]
def item_children(drive_id, item_id, fields: [], &block)
  children_endpoint = "drives/#{drive_id}/items/#{item_id}/children"
  select_params = transform_fields_to_request_query_params(fields)
  page = request_endpoint(:endpoint => children_endpoint, :query_params => select_params)
  page.value.each(&block)

  until (next_link = page['@odata.nextLink']).nil?
    page = request_json(:url => next_link)
    page.value.each(&block)
  end
end
271
+
272
# Headers sent with JSON requests: the Bearer token and a JSON content type.
#
# @return [Hash{String => String}]
def base_headers
  bearer_credential = "Bearer #{access_token}"
  { 'Authorization' => bearer_credential, 'Content-Type' => 'application/json' }
end
278
+
279
# Returns the response when its status is successful; otherwise raises.
#
# For a failed response the body is parsed as JSON. Error code `resyncRequired` increments
# a stat and raises Office365InvalidCursorsError. Otherwise the error's `message` is itself
# parsed as JSON and its `Message` entry becomes the exception message; if any of that
# parsing fails, a generic message with status, URL and query params is used.
#
# @param response [Object] responds to `status` and `body`
# @param url [String] requested URL, attached to the raised error
# @param query_params [Hash, nil] query params, used only in the fallback message
# @return [Object] the unchanged response on success
# @raise [Office365InvalidCursorsError] when the API reports `resyncRequired`
# @raise [ClientError] for every other unsuccessful response
def raise_any_errors(response, url:, query_params: {})
  return response if HTTP::Status.successful?(response.status)

  response_body = response.body.to_s
  error_message =
    begin
      error = JSON.parse(response_body).fetch('error')
      if error['code'] == 'resyncRequired'
        Connectors::Stats.increment('custom_client.office365.error.invalid_cursors')
        raise Office365InvalidCursorsError.new(response.status, url)
      end
      JSON.parse(error.fetch('message')).fetch('Message').strip
    rescue ClientError
      # Let the invalid-cursors error raised above propagate untouched.
      raise
    rescue StandardError
      "got a #{response.status} from #{url} with query #{query_params}"
    end

  raise ClientError.new(response.status, url), error_message
end
299
+
300
# Requests an endpoint relative to BASE_URL and returns the parsed JSON.
#
# @param endpoint [String] path appended to BASE_URL
# @param query_params [Hash, nil] query params for the request
# @return [Object] the result of `request_json`
def request_endpoint(endpoint:, query_params: nil)
  absolute_url = "#{BASE_URL}#{endpoint}"
  request_json(:url => absolute_url, :query_params => query_params)
end
304
+
305
# Performs a request with the base headers and wraps the parsed JSON body in a Hashie::Mash.
#
# @param url [String] absolute URL to request
# @param query_params [Hash, nil] query params for the request
# @return [Hashie::Mash] the parsed response body
def request_json(url:, query_params: nil)
  raw_response = request(:url => url, :query_params => query_params, :headers => base_headers)
  parsed_body = JSON.parse(raw_response.body)
  Hashie::Mash.new(parsed_body)
end
309
+
310
# Issues a GET request and passes the response through `raise_any_errors`.
#
# @param url [String] absolute URL to request
# @param query_params [Hash, nil] query params for the request
# @param headers [Hash, nil] request headers
# @param block [Proc] forwarded to `get`
# @return [Object] the response, when `raise_any_errors` does not raise
def request(url:, query_params: nil, headers: nil, &block)
  raw_response = get(url, query_params, headers, &block)
  raise_any_errors(raw_response, :url => url, :query_params => query_params)
end
317
+ end
318
+ end
319
+ end