connectors_sdk 8.2.0.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,153 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'connectors_sdk/base/adapter'
10
+
11
+ module ConnectorsSdk
12
+ module Office365
13
+ class Adapter < ConnectorsSdk::Base::Adapter
14
# Abstract hook for turning a file object into a document.
# This implementation ignores its argument and always raises NotImplementedError.
def self.swiftype_document_from_file(_file)
  raise(NotImplementedError)
end
17
+
18
# Abstract hook for turning a folder object into a document.
# This implementation ignores its argument and always raises NotImplementedError.
def self.swiftype_document_from_folder(_folder)
  raise(NotImplementedError)
end
21
+
22
# Wraps a single item and converts it into a document hash through
# #to_swiftype_document. `type`, `fields` and `.convert_id_to_fp_id` raise
# NotImplementedError here and have to be supplied by subclasses.
class GraphItem
  attr_reader :item

  def initialize(item)
    @item = item
  end

  # Abstract: maps an item id to the id stored on the document.
  def self.convert_id_to_fp_id(_id)
    raise(NotImplementedError)
  end

  # Builds the normalized path of +item+ from its parent reference path and name.
  # A parent path ending in 'root:' contributes nothing; otherwise the URL-decoded
  # parent path is cut down to the part following the last 'root:'.
  def self.get_path(item)
    raw_parent_path = item.parentReference&.path || ''
    folder_path = raw_parent_path.end_with?('root:') ? '' : CGI.unescape(raw_parent_path).split('root:').last
    ConnectorsSdk::Office365::Adapter.normalize_path("#{folder_path}/#{item.name}")
  end

  # Returns the document hash: the common attributes below, overlaid with the
  # subclass-specific `fields` and finally with `permissions`.
  def to_swiftype_document
    base_adapter = ConnectorsSdk::Base::Adapter
    common_attributes = {
      :_fields_to_preserve => ConnectorsSdk::Office365::Adapter.fields_to_preserve,
      :id => self.class.convert_id_to_fp_id(item.id),
      :path => get_path(item),
      :title => item.name,
      :url => item.webUrl,
      :type => base_adapter.normalize_enum(type),
      :created_by => created_by(item),
      :created_at => base_adapter.normalize_date(item.createdDateTime),
      :last_updated => base_adapter.normalize_date(item.lastModifiedDateTime),
      :updated_by => last_modified_by(item),
      :drive_owner => item.drive_owner_name
    }
    common_attributes.merge(fields).merge(permissions)
  end

  private

  # Instance-level shortcut to the class-level path builder.
  def get_path(item)
    ConnectorsSdk::Office365::Adapter::GraphItem.get_path(item)
  end

  # Abstract: the document type of the wrapped item.
  def type
    raise(NotImplementedError)
  end

  # Abstract: extra document attributes merged over the common ones.
  def fields
    raise(NotImplementedError)
  end

  # Display name of the creating user, or nil when any link in the chain is missing.
  def created_by(item)
    creator = item.createdBy
    creator&.user&.displayName
  end

  # Display name of the last modifying user, or nil when any link in the chain is missing.
  def last_modified_by(item)
    modifier = item.lastModifiedBy
    modifier&.user&.displayName
  end

  # Returns a one-entry hash keyed by ALLOW_FIELD listing the unique ids and
  # display names of the users each permission was granted to, or {} when the
  # item carries no permissions.
  def permissions
    return {} unless item.permissions.present?

    grantees_per_permission = item.permissions.map do |permission|
      user_id = permission.dig(:grantedTo, :user, :id)
      user_name = permission.dig(:grantedTo, :user, :displayName)
      [user_id, user_name].compact
    end

    { ConnectorsShared::Constants::ALLOW_FIELD => grantees_per_permission.flatten.uniq }
  end
end
97
+
98
# GraphItem flavour for files: reports the type 'file' and adds title,
# mime type and extension derived from the item's name.
class FileGraphItem < GraphItem
  # Abstract: id conversion is still left to a more specific subclass.
  def self.convert_id_to_fp_id(_id)
    raise(NotImplementedError)
  end

  private

  def type
    'file'
  end

  # File-specific attributes; :title overrides the common title with the
  # extension-less file name.
  def fields
    # FIXME: potentially add `updated_by_email`
    base_adapter = ConnectorsSdk::Base::Adapter
    {
      :title => base_adapter.strip_file_extension(item.name),
      :mime_type => base_adapter.mime_type_for_file(item.name),
      :extension => base_adapter.extension_for_file(item.name)
    }
  end
end
118
+
119
# GraphItem flavour for folders: reports the type 'folder' and picks the title
# depending on whether the item is flagged as a root.
class FolderGraphItem < GraphItem
  private

  def type
    'folder'
  end

  # A root item is titled with the drive name, any other folder with its own name.
  def fields
    folder_title = item.root ? item.drive_name : item.name
    { :title => folder_title }
  end
end
133
+
134
# GraphItem flavour for packages: the type is derived from the package type and
# no extra fields are added.
class PackageGraphItem < GraphItem
  # Abstract: id conversion is left to a more specific subclass.
  def self.convert_id_to_fp_id(id)
    raise(NotImplementedError)
  end

  private

  def type
    # MSFT reports package types like 'oneNote'; the document should say 'OneNote'
    package_type = item.package.type
    package_type.classify
  end

  # Packages contribute no additional document attributes.
  def fields
    {}
  end
end
151
+ end
152
+ end
153
+ end
@@ -0,0 +1,37 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'connectors_sdk/base/config'
10
+
11
+ module ConnectorsSdk
12
+ module Office365
13
# Office365 flavour of the base config: adds the drives to index and whether
# permissions are indexed, and makes sure the per-drive cursor bucket exists.
class Config < ConnectorsSdk::Base::Config
  # Value of `drive_ids` that stands for "every drive".
  ALL_DRIVE_IDS = 'all'.freeze

  attr_reader :drive_ids, :index_permissions

  # drive_ids         - the drives to index, or ALL_DRIVE_IDS
  # cursors           - forwarded to the base config
  # index_permissions - defaults to false
  def initialize(drive_ids:, cursors:, index_permissions: false)
    super(:cursors => cursors)
    @drive_ids = drive_ids
    @index_permissions = index_permissions

    # Keep an existing per-drive cursor hash, otherwise start with an empty one.
    drive_cursor_key = ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY
    @cursors[drive_cursor_key] ||= {}
  end

  # True when the config asks for every drive rather than an explicit selection.
  def index_all_drives?
    drive_ids == ALL_DRIVE_IDS
  end

  # The base config's hash extended with the Office365-specific settings.
  def to_h
    inherited = super
    inherited.merge(
      :drive_ids => drive_ids,
      :index_permissions => index_permissions
    )
  end
end
36
+ end
37
+ end
@@ -0,0 +1,339 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'connectors_sdk/base/custom_client'
10
+ require 'connectors_shared'
11
+ require 'hashie/mash'
12
+
13
+ module ConnectorsSdk
14
+ module Office365
15
+ class CustomClient < ConnectorsSdk::Base::CustomClient
16
+
17
+ OFFICE365_PERMISSION_SYNC_TIME_SLA = 24.hours
18
+
19
# Error carrying the HTTP status code and the requested URL.
# The constructor takes no message and does not call super, so a message has to
# be attached at raise time (`raise ClientError.new(status, url), message`).
class ClientError < ConnectorsShared::ClientError
  attr_reader :status_code, :endpoint

  def initialize(status_code, endpoint)
    @endpoint = endpoint
    @status_code = status_code
  end
end

# ClientError subclass used when the API answers with the 'resyncRequired' error code.
class Office365InvalidCursorsError < ClientError
end
29
+
30
+ # This is necessary because `Faraday::NestedParamsEncoder.encode` changes the
31
+ # order of params, which Microsoft's download API can't handle for some reason.
32
# Query-string encoder that keeps parameters in the order they were given.
# `escape` and `decode` are delegated to Faraday::NestedParamsEncoder; only
# `encode` is custom.
module Office365DownloadParamsEncoder
  class << self
    extend Forwardable
    def_delegators :'Faraday::NestedParamsEncoder', :escape, :decode

    # Joins every key/value pair as "key=value" (both escaped) with '&',
    # preserving the iteration order of +params+.
    def encode(params)
      encoded_pairs = params.each_with_object([]) do |(key, value), pairs|
        pairs << "#{escape(key)}=#{escape(value)}"
      end
      encoded_pairs.join('&')
    end
  end
end
44
+
45
+ attr_reader :access_token
46
+ attr_accessor :cursors
47
+
48
+ BASE_URL = 'https://graph.microsoft.com/v1.0/'.freeze
49
+
50
# access_token      - token sent as the Bearer credential on every request
# cursors           - sync cursors; nil is treated like an empty hash
# ensure_fresh_auth - forwarded untouched to the base client
def initialize(access_token:, cursors: {}, ensure_fresh_auth: nil)
  @access_token = access_token

  # Always work with a hash that contains a bucket for per-drive cursors.
  drive_cursor_key = ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY
  @cursors = cursors || {}
  @cursors[drive_cursor_key] ||= {}

  super(:ensure_fresh_auth => ensure_fresh_auth)
end
56
+
57
# Swaps in a new access token and returns the client itself so calls can be chained.
def update_auth_data!(new_access_token)
  tap { @access_token = new_access_token }
end
61
+
62
# Requests the 'me' endpoint and returns the parsed response.
def me
  endpoint = 'me'
  request_endpoint(:endpoint => endpoint)
end
65
+
66
# Returns the `value` list of a single 'me/drives/' response.
# fields - optional attribute names, sent as a '$select' query parameter.
def one_drive_drives(fields: [])
  request_endpoint(
    :endpoint => 'me/drives/',
    :query_params => transform_fields_to_request_query_params(fields)
  ).value
end
71
+
72
# Returns the drives of every known SharePoint site, flattened into one list.
# fields - optional attribute names requested for each drive.
def share_point_drives(fields: [])
  # When a new Private Team site is created in SharePoint, permissions take some time to propagate, so the
  # site is not returned by the regular site listing until then. Sites of recently created groups (a new
  # Private Team site will be among them) are fetched as well, to index such a site earlier.
  # See: https://github.com/elastic/ent-search/pull/3581
  #
  # Both lookups only need the site id. `%w[id]` is the array ['id']; a bare `%[id]` would be the String "id".
  share_point_sites = sites(:fields => %w[id]) + recent_share_point_group_sites(:fields => %w[id])

  share_point_sites
    .map(&:id)
    .uniq
    .flat_map { |site_id| site_drives(site_id, :fields => fields) } # site_drives yields nil for failed sites
    .compact
end
87
+
88
# Returns every entry of the 'groups/' endpoint, following pagination.
# fields - optional attribute names to request.
def groups(fields: [])
  endpoint = 'groups/'
  request_all(:endpoint => endpoint, :fields => fields)
end
91
+
92
# Requests the root site of the given group.
# fields - optional attribute names, sent as a '$select' query parameter.
def group_root_site(group_id, fields: [])
  request_endpoint(
    :endpoint => "groups/#{group_id}/sites/root",
    :query_params => transform_fields_to_request_query_params(fields)
  )
end
97
+
98
# Returns every site, following pagination.
# fields - optional attribute names to request.
def sites(fields: [])
  # The empty search string makes the API return all sites; leaving it off results in a 400.
  # The page size is set explicitly (via :top) because otherwise the API only returns the first ten
  # sites and does not provide any additional pages.
  list_everything = { :search => '', :top => 10 }
  request_all(:endpoint => 'sites/', :fields => fields, :additional_query_params => list_everything)
end
104
+
105
# Returns the document-library drives of a site, or nil when the request fails
# with a ClientError (the failure is logged at info level and swallowed).
# fields - optional attribute names to request.
def site_drives(site_id, fields: [])
  all_drives = request_all(:endpoint => "sites/#{site_id}/drives/", :fields => fields)
  document_libraries(all_drives)
rescue ClientError => error
  ConnectorsShared::Logger.info("Received response of #{error.status_code} trying to get drive for Site with Id = #{site_id}: #{error.message}")
  nil
end
113
+
114
# Yields every item of a drive by walking its folders depth-first.
#
# fields           - optional attribute names; 'id' is added when any are given
# break_after_page - when true, the walk stops once at least 100 items were yielded
#                    and folders are still pending; the pending folder ids are kept
#                    in cursors['page_cursor'] and picked up by the next call
def list_items(drive_id, fields: [], break_after_page: false)
  # MSFT Graph API has no recursive listing, so folders are tracked on an explicit stack.
  pending_folder_ids =
    if break_after_page && cursors['page_cursor'].present?
      cursors.delete('page_cursor')
    else
      [get_root_item(drive_id, ['id']).id]
    end

  # The id of every child is needed below to descend into folders.
  requested_fields = fields.any? ? fields | ['id'] : fields
  yielded_count = 0

  while pending_folder_ids.any?
    current_folder_id = pending_folder_ids.pop
    item_children(drive_id, current_folder_id, :fields => requested_fields, :break_after_page => break_after_page) do |child|
      pending_folder_ids << child.id if child.folder
      yield child

      yielded_count += 1
    end

    next unless break_after_page && yielded_count >= 100

    # The current folder still has unread pages: visit it again on resume.
    pending_folder_ids << current_folder_id if cursors['item_children_next_link'].present?
    next unless pending_folder_ids.any?

    cursors['page_cursor'] = pending_folder_ids.dup
    break
  end
end
148
+
149
# Returns the `value` list of the permissions response for one drive item.
def item_permissions(drive_id, item_id)
  permissions_response = request_endpoint(:endpoint => "drives/#{drive_id}/items/#{item_id}/permissions")
  permissions_response.value
end
152
+
153
# Yields the changes of a drive's delta feed, page by page.
#
# drive_id         - drive whose delta feed is read
# start_delta_link - delta link to continue from; nil starts from the drive's root delta endpoint
# last_modified    - when present, changes modified before this time are skipped
# break_after_page - when true, reading stops after a page once at least 100 changes were
#                    yielded and more pages exist; the next page link is kept in
#                    cursors['page_cursor'] and picked up by the next call
#
# After the loop the per-drive cursor is set to the '@odata.deltaLink' of the last response read.
# NOTE(review): when the loop stopped early that response carries a next link, so the stored
# delta link is presumably nil in that case - confirm this is intended.
def list_changes(drive_id:, start_delta_link: nil, last_modified: nil, break_after_page: false)
  selected_fields = %w(id content.downloadUrl lastModifiedDateTime lastModifiedBy root deleted file folder package name webUrl createdBy createdDateTime size)
  query_params = { :'$select' => selected_fields.join(',') }

  response =
    if break_after_page && cursors['page_cursor'].present?
      request_json(:url => cursors.delete('page_cursor'))
    elsif start_delta_link.nil?
      request_endpoint(:endpoint => "drives/#{drive_id}/root/delta", :query_params => query_params)
    else
      request_json(:url => start_delta_link, :query_params => query_params)
    end

  yielded_count = 0
  loop do
    response.value.each do |change|
      # MSFT Graph API does not return changes in chronological order, so without a cursor every change
      # has to be visited and only the ones at or past `last_modified` are picked; getting another
      # cursor would require going through all the changes anyway.
      too_old = last_modified.present? && Time.parse(change.lastModifiedDateTime) < last_modified
      next if too_old
      next if change.root # The root of the drive is not indexed

      yield change
      yielded_count += 1
    end

    next_page_link = response['@odata.nextLink']
    if break_after_page && yielded_count >= 100 && next_page_link.present?
      cursors['page_cursor'] = next_page_link
      break
    end

    break if next_page_link.nil?

    response = request_json(:url => next_page_link)
  end

  cursors[ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY][drive_id] = response['@odata.deltaLink']
end
189
+
190
# Returns the delta link stored in the per-drive cursors, falling back to
# fetching one from the API when nothing is stored for the drive.
def get_latest_delta_link(drive_id)
  stored_link = cursors[ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY][drive_id]
  stored_link || exhaustively_get_delta_link(drive_id)
end
193
+
194
# Pages through the drive's whole delta feed (requesting only ids) until no next
# link is left, then returns the final '@odata.deltaLink' with everything from
# the first '?' on removed. The work is wrapped in a Connectors::Stats measurement.
def exhaustively_get_delta_link(drive_id)
  Connectors::Stats.measure('custom_client.office365.exhaustively_get_delta_link') do
    page = request_endpoint(:endpoint => "drives/#{drive_id}/root/delta", :query_params => { :'$select' => 'id' })
    page = request_json(:url => page['@odata.nextLink']) while page['@odata.nextLink']

    page['@odata.deltaLink'].split('?').first
  end
end
207
+
208
# Fetches +download_url+ and returns the response body. The request is switched
# to the order-preserving Office365DownloadParamsEncoder before it is sent.
def download_item(download_url)
  download_response = request(:url => download_url) do |outgoing_request|
    outgoing_request.options.params_encoder = Office365DownloadParamsEncoder
  end
  download_response.body
end
213
+
214
# Returns the unique union of everything the user is a transitive member of and
# the groups among the objects the user owns.
# fields - optional attribute names to request (positional, unlike the other listers).
def user_groups(user_id, fields = [])
  memberships = request_all(:endpoint => "users/#{user_id}/transitiveMemberOf", :fields => fields)
  owned_objects = request_all(:endpoint => "users/#{user_id}/ownedObjects", :fields => fields)

  # Owned objects are not only groups; keep just the group entries.
  owned_groups = owned_objects.select { |owned| owned['@odata.type'] == '#microsoft.graph.group' }

  (memberships + owned_groups).uniq
end
226
+
227
+ private
228
+
229
# Returns the root sites of all groups created within the last
# OFFICE365_PERMISSION_SYNC_TIME_SLA.
#
# fields - accepted for signature parity with the other listers but not used;
#          only the id of each root site is requested.
def recent_share_point_group_sites(fields: [])
  # group.createdDateTime field is UTC as stated in documentation:
  # https://docs.microsoft.com/en-us/graph/api/resources/group?view=graph-rest-1.0#properties
  created_date_time_threshold = Time.now.utc - OFFICE365_PERMISSION_SYNC_TIME_SLA

  recent_groups = groups(:fields => %w[id createdDateTime]).select do |group|
    created_at = group.createdDateTime
    # The value comes out of parsed JSON as a String; comparing a String with a
    # Time raises ArgumentError, so it has to be parsed first.
    created_at = Time.parse(created_at) if created_at.is_a?(String)
    # A group without a creation time cannot be classified as recent.
    !created_at.nil? && created_at > created_date_time_threshold
  end

  recent_groups
    .map { |group| group_root_site(group.id, :fields => %w[id]) }
    .compact
end
238
+
239
# Keeps only the drives whose driveType is 'documentLibrary'.
def document_libraries(drives)
  drives.select do |drive|
    drive_type = drive.driveType
    drive_type == 'documentLibrary'
  end
end
242
+
243
# Turns a list of field names into query parameters: a '$select' entry with the
# comma-joined names, or an empty hash when no fields are given.
def transform_fields_to_request_query_params(fields = [])
  return {} if fields.empty?

  { :'$select' => fields.join(',') }
end
246
+
247
# Requests an endpoint and keeps following '@odata.nextLink' until none is left,
# returning the concatenated `value` lists of all pages.
#
# fields                  - optional attribute names, sent as '$select'
# additional_query_params - merged over the '$select' parameters of the first request
def request_all(endpoint:, fields: [], additional_query_params: {})
  first_page_params = transform_fields_to_request_query_params(fields).merge(additional_query_params)
  page = request_endpoint(:endpoint => endpoint, :query_params => first_page_params)

  # Later pages are appended onto the first page's own list.
  collected = page.value
  while (following_link = page['@odata.nextLink'])
    page = request_json(:url => following_link)
    collected.concat(page.value)
  end
  collected
end
258
+
259
# Requests the root item of a drive.
# fields - optional attribute names (positional), sent as a '$select' query parameter.
def get_root_item(drive_id, fields = [])
  request_endpoint(
    :endpoint => "drives/#{drive_id}/root",
    :query_params => transform_fields_to_request_query_params(fields)
  )
end
263
+
264
# Passes every child of a drive item to the given block, page by page.
#
# fields           - optional attribute names, sent as '$select' on the first request
# break_after_page - when true, a link stored in cursors['item_children_next_link'] is
#                    consumed and used as the starting page; reading stops once at least
#                    100 children of pages with a successor were handed out, and the link
#                    to the unread page is stored under the same cursor key
def item_children(drive_id, item_id, fields: [], break_after_page: false, &block)
  resume_link = break_after_page ? cursors.delete('item_children_next_link') : nil

  page =
    if resume_link.present?
      request_json(:url => resume_link)
    else
      request_endpoint(
        :endpoint => "drives/#{drive_id}/items/#{item_id}/children",
        :query_params => transform_fields_to_request_query_params(fields)
      )
    end

  handed_out = 0
  loop do
    page.value.each(&block)

    following_link = page['@odata.nextLink']
    break if following_link.nil?

    # The last page is not counted: there is nothing left to resume from anyway.
    handed_out += page.value.size
    if break_after_page && handed_out >= 100
      cursors['item_children_next_link'] = following_link
      break
    end

    page = request_json(:url => following_link)
  end
end
291
+
292
# Headers sent with JSON requests: the Bearer authorization built from the
# current access token plus a JSON content type.
def base_headers
  headers = {}
  headers['Authorization'] = "Bearer #{access_token}"
  headers['Content-Type'] = 'application/json'
  headers
end
298
+
299
# Returns +response+ untouched when its status is successful, otherwise raises.
#
# Raises Office365InvalidCursorsError when the error body carries the code 'resyncRequired'.
# Raises ClientError for every other failure; the message is the stripped 'Message' found in
# the JSON-encoded error message, or a generic description when the body cannot be parsed
# that way.
def raise_any_errors(response, url:, query_params: {})
  return response if HTTP::Status.successful?(response.status)

  raw_body = response.body.to_s
  failure_message =
    begin
      error_details = JSON.parse(raw_body).fetch('error')
      if error_details['code'] == 'resyncRequired'
        Connectors::Stats.increment('custom_client.office365.error.invalid_cursors')
        raise Office365InvalidCursorsError.new(response.status, url)
      end
      # The error message is itself a JSON document carrying the readable text.
      JSON.parse(error_details.fetch('message')).fetch('Message').strip
    rescue ClientError
      # The invalid-cursors error raised above must not be turned into the generic message.
      raise
    rescue StandardError
      "got a #{response.status} from #{url} with query #{query_params}"
    end

  raise ClientError.new(response.status, url), failure_message
end
319
+
320
# Requests an endpoint given relative to BASE_URL and returns the parsed JSON response.
def request_endpoint(endpoint:, query_params: nil)
  absolute_url = "#{BASE_URL}#{endpoint}"
  request_json(:url => absolute_url, :query_params => query_params)
end
324
+
325
# Performs a request with the base headers and returns the JSON body wrapped in
# a Hashie::Mash.
def request_json(url:, query_params: nil)
  raw_response = request(:url => url, :query_params => query_params, :headers => base_headers)
  parsed_body = JSON.parse(raw_response.body)
  Hashie::Mash.new(parsed_body)
end
329
+
330
# Issues a GET for +url+ and passes the response through raise_any_errors, so a
# non-successful status surfaces as an exception. An optional block is forwarded
# to `get`.
def request(url:, query_params: nil, headers: nil, &block)
  raw_response = get(url, query_params, headers, &block)
  raise_any_errors(raw_response, :url => url, :query_params => query_params)
end
337
+ end
338
+ end
339
+ end