connectors_sdk 8.3.0.0.pre.20220414T060419Z

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,153 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'connectors_sdk/base/adapter'
10
+
11
+ module ConnectorsSdk
12
+ module Office365
13
+ class Adapter < ConnectorsSdk::Base::Adapter
14
# Placeholder for converting a file item into a document.
# The argument is ignored by this implementation.
#
# @raise [NotImplementedError] always
def self.swiftype_document_from_file(_file)
  raise(NotImplementedError)
end
17
+
18
# Placeholder for converting a folder item into a document.
# The argument is ignored by this implementation.
#
# @raise [NotImplementedError] always
def self.swiftype_document_from_folder(_folder)
  raise(NotImplementedError)
end
21
+
22
# Wraps one drive item and turns it into a document hash.
#
# `type`, `fields` and `.convert_id_to_fp_id` only raise NotImplementedError
# here; `to_swiftype_document` calls all three.
class GraphItem
  attr_reader :item

  # @param item the wrapped item; it is read through method-style accessors
  #   (id, name, webUrl, createdBy, permissions, ...)
  def initialize(item)
    @item = item
  end

  # @raise [NotImplementedError] always
  def self.convert_id_to_fp_id(_id)
    raise NotImplementedError
  end

  # Builds the item's path from its parent reference and its own name.
  #
  # Everything up to and including the last 'root:' marker of the (URL-unescaped)
  # parent path is dropped; a parent path that ends with 'root:' contributes
  # nothing. A missing parent reference is treated as an empty parent path.
  #
  # @return whatever Adapter.normalize_path returns for "<parent folder>/<name>"
  def self.get_path(item)
    parent_path = item.parentReference&.path || ''
    folder_path = parent_path.end_with?('root:') ? '' : CGI.unescape(parent_path).split('root:').last
    ConnectorsSdk::Office365::Adapter.normalize_path("#{folder_path}/#{item.name}")
  end

  # Assembles the document: the common attributes first, then the result of
  # `fields` merged on top, then the result of `permissions` merged on top.
  #
  # @return [Hash]
  def to_swiftype_document
    base_adapter = ConnectorsSdk::Base::Adapter
    document = {
      :_fields_to_preserve => ConnectorsSdk::Office365::Adapter.fields_to_preserve,
      :id => self.class.convert_id_to_fp_id(item.id),
      :path => get_path(item),
      :title => item.name,
      :url => item.webUrl,
      :type => base_adapter.normalize_enum(type),
      :created_by => created_by(item),
      :created_at => base_adapter.normalize_date(item.createdDateTime),
      :last_updated => base_adapter.normalize_date(item.lastModifiedDateTime),
      :updated_by => last_modified_by(item),
      :drive_owner => item.drive_owner_name
    }
    document.merge(fields).merge(permissions)
  end

  private

  # Instance-level shortcut that always uses GraphItem.get_path.
  def get_path(item)
    ConnectorsSdk::Office365::Adapter::GraphItem.get_path(item)
  end

  # @raise [NotImplementedError] always
  def type
    raise NotImplementedError
  end

  # @raise [NotImplementedError] always
  def fields
    raise NotImplementedError
  end

  # @return display name of the creating user, or nil when any link is missing
  def created_by(item)
    item.createdBy&.user&.displayName
  end

  # @return display name of the last modifying user, or nil when any link is missing
  def last_modified_by(item)
    item.lastModifiedBy&.user&.displayName
  end

  # Collects the id and display name of every `grantedTo` user across the
  # item's permissions into one de-duplicated list under the ALLOW_FIELD key.
  #
  # @return [Hash] empty when the item has no permissions present
  def permissions
    return {} unless item.permissions.present?

    granted = item.permissions.map do |permission|
      user_id = permission.dig(:grantedTo, :user, :id)
      user_name = permission.dig(:grantedTo, :user, :displayName)
      [user_id, user_name].compact
    end
    { ConnectorsShared::Constants::ALLOW_FIELD => granted.flatten.uniq }
  end
end
97
+
98
# GraphItem flavour whose type is 'file'.
class FileGraphItem < GraphItem
  # @raise [NotImplementedError] always
  def self.convert_id_to_fp_id(_id)
    raise NotImplementedError
  end

  private

  def type
    'file'
  end

  # File-specific attributes, all derived from the item's name.
  #
  # @return [Hash] :title, :mime_type and :extension
  def fields
    # FIXME: potentially add `updated_by_email`
    base_adapter = ConnectorsSdk::Base::Adapter
    {
      :title => base_adapter.strip_file_extension(item.name),
      :mime_type => base_adapter.mime_type_for_file(item.name),
      :extension => base_adapter.extension_for_file(item.name)
    }
  end
end
118
+
119
# GraphItem flavour whose type is 'folder'.
class FolderGraphItem < GraphItem
  private

  def type
    'folder'
  end

  # A folder flagged as root is titled with the drive name; any other folder
  # is titled with its own name.
  #
  # @return [Hash] with a single :title entry
  def fields
    title =
      if item.root
        item.drive_name
      else
        item.name
      end
    { :title => title }
  end
end
133
+
134
# GraphItem flavour for package items; contributes no extra fields.
class PackageGraphItem < GraphItem
  # @raise [NotImplementedError] always
  def self.convert_id_to_fp_id(_id)
    raise NotImplementedError
  end

  private

  # MSFT gives packages as 'oneNote' and it should be called 'OneNote'
  def type
    package_type = item.package.type
    package_type.classify
  end

  def fields
    {}
  end
end
151
+ end
152
+ end
153
+ end
@@ -0,0 +1,37 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'connectors_sdk/base/config'
10
+
11
+ module ConnectorsSdk
12
+ module Office365
13
# Office 365 configuration: which drives to index and whether to index permissions.
class Config < ConnectorsSdk::Base::Config
  # Value of `drive_ids` that means "index every drive".
  ALL_DRIVE_IDS = 'all'.freeze

  attr_reader :drive_ids, :index_permissions

  # @param drive_ids the drives to index, or ALL_DRIVE_IDS
  # @param cursors forwarded to the superclass
  # @param index_permissions defaults to false
  def initialize(drive_ids:, cursors:, index_permissions: false)
    super(:cursors => cursors)
    # NOTE(review): relies on the superclass having populated @cursors - confirm
    drive_cursor_key = ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY
    @cursors[drive_cursor_key] ||= {}
    @drive_ids = drive_ids
    @index_permissions = index_permissions
  end

  # @return [Boolean] true when `drive_ids` equals ALL_DRIVE_IDS
  def index_all_drives?
    drive_ids == ALL_DRIVE_IDS
  end

  # @return the superclass hash extended with :drive_ids and :index_permissions
  def to_h
    office365_settings = { :drive_ids => drive_ids, :index_permissions => index_permissions }
    super.merge(office365_settings)
  end
end
36
+ end
37
+ end
@@ -0,0 +1,319 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ # frozen_string_literal: true
8
+
9
+ require 'connectors_sdk/base/custom_client'
10
+ require 'connectors_shared'
11
+ require 'hashie/mash'
12
+
13
+ module ConnectorsSdk
14
+ module Office365
15
+ class CustomClient < ConnectorsSdk::Base::CustomClient
16
+
17
+ OFFICE365_PERMISSION_SYNC_TIME_SLA = 24.hours
18
+
19
# Error carrying the HTTP status code and the endpoint that produced it.
class ClientError < ConnectorsShared::ClientError
  attr_reader :status_code, :endpoint

  # @param status_code the HTTP status of the failed response
  # @param endpoint the URL/endpoint that was requested
  def initialize(status_code, endpoint)
    @status_code, @endpoint = status_code, endpoint
  end
end
27
+
28
# ClientError subclass; adds no behaviour of its own.
class Office365InvalidCursorsError < ClientError
end
29
+
30
# This is necessary because `Faraday::NestedParamsEncoder.encode` changes the
# order of params, which Microsoft's download API can't handle for some reason.
#
# `escape` and `decode` are delegated to Faraday::NestedParamsEncoder; only
# `encode` is replaced so that pairs are emitted in the order they are given.
module Office365DownloadParamsEncoder
  class << self
    extend Forwardable
    def_delegators :'Faraday::NestedParamsEncoder', :escape, :decode

    # @param params an enumerable of key/value pairs
    # @return [String] "key=value" pairs joined with '&', in iteration order
    def encode(params)
      params.map { |name, content| "#{escape(name)}=#{escape(content)}" }.join('&')
    end
  end
end
44
+
45
+ attr_reader :access_token
46
+ attr_accessor :cursors
47
+
48
+ BASE_URL = 'https://graph.microsoft.com/v1.0/'.freeze
49
+
50
# @param access_token the bearer token sent with every request
# @param cursors [Hash, nil] cursor state; nil is replaced by a new Hash. The
#   Hash is used as-is (not copied) and gains an empty per-drive entry when it
#   has none.
# @param ensure_fresh_auth forwarded to the superclass
def initialize(access_token:, cursors: {}, ensure_fresh_auth: nil)
  @access_token = access_token
  @cursors = cursors || {}
  drive_cursor_key = ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY
  @cursors[drive_cursor_key] ||= {}
  super(:ensure_fresh_auth => ensure_fresh_auth)
end
56
+
57
# Replaces the stored access token.
#
# @param new_access_token the token to use from now on
# @return [self] so calls can be chained
def update_auth_data!(new_access_token)
  tap { @access_token = new_access_token }
end
61
+
62
# Requests the 'me' endpoint.
#
# @return the parsed response
def me
  profile_endpoint = 'me'
  request_endpoint(:endpoint => profile_endpoint)
end
65
+
66
# Requests 'me/drives/' and returns the `value` of the response.
#
# @param fields [Array<String>] optional field names to select
def one_drive_drives(fields: [])
  request_endpoint(
    :endpoint => 'me/drives/',
    :query_params => transform_fields_to_request_query_params(fields)
  ).value
end
71
+
72
# Collects the drives of every known SharePoint site.
#
# Site ids come from `sites` plus `recent_share_point_group_sites`; duplicates
# are removed before `site_drives` is called once per site. Sites for which
# `site_drives` returns nil are dropped from the result.
#
# @param fields [Array<String>] field names to select on each drive
# @return [Array] flat list of drives
def share_point_drives(fields: [])
  # When new Private Team site is created in SharePoint, permissions take some time to propagate, therefore
  # this site won't be indexed by us until propagation happens. This code tries to also fetch sites from
  # recently created groups (new Private Team site will be there) to reduce friction and index this site
  # earlier.
  # See: https://github.com/elastic/ent-search/pull/3581
  site_fields = %w[id] # must be an Array of field names, not the String that `%[id]` produces
  share_point_sites = sites(:fields => site_fields) + recent_share_point_group_sites(:fields => site_fields)

  share_point_sites
    .map(&:id)
    .uniq
    .flat_map { |site_id| site_drives(site_id, :fields => fields) }
    .compact
end
87
+
88
# Fetches every page of the 'groups/' endpoint.
#
# @param fields [Array<String>] optional field names to select
# @return the combined items of all pages
def groups(fields: [])
  groups_endpoint = 'groups/'
  request_all(:endpoint => groups_endpoint, :fields => fields)
end
91
+
92
# Requests the root site of a group.
#
# @param group_id the group whose root site is requested
# @param fields [Array<String>] optional field names to select
# @return the parsed response
def group_root_site(group_id, fields: [])
  request_endpoint(
    :endpoint => "groups/#{group_id}/sites/root",
    :query_params => transform_fields_to_request_query_params(fields)
  )
end
97
+
98
# Fetches every page of the 'sites/' endpoint.
#
# @param fields [Array<String>] optional field names to select
# @return the combined items of all pages
def sites(fields: [])
  # This empty search string ends up returning all sites. If we leave it off, the API returns a 400.
  # The page size is set explicitly here (via :top) because otherwise the API just returns the first ten
  # and does not provide any additional pages.
  paging_params = { :search => '', :top => 10 }
  request_all(:endpoint => 'sites/', :fields => fields, :additional_query_params => paging_params)
end
104
+
105
# Fetches the drives of a site and keeps only the document libraries.
#
# A ClientError is logged at info level and swallowed.
#
# @param site_id the site whose drives are requested
# @param fields [Array<String>] optional field names to select
# @return [Array, nil] the document-library drives, or nil after a ClientError
def site_drives(site_id, fields: [])
  all_drives = request_all(:endpoint => "sites/#{site_id}/drives/", :fields => fields)
  document_libraries(all_drives)
rescue ClientError => e
  ConnectorsShared::Logger.info("Received response of #{e.status_code} trying to get drive for Site with Id = #{site_id}: #{e.message}")
  nil
end
113
+
114
# Walks a drive depth-first and yields every item found.
#
# With `break_after_page`, the walk stops once at least 100 items were yielded
# and folders are still pending; the pending folder ids are saved under
# cursors['page_cursor'] and used as the starting stack of the next call.
#
# @param drive_id the drive to walk
# @param fields [Array<String>] field names to select; when non-empty, `id`
#   and `folder` are added because the traversal itself needs them
# @param break_after_page [Boolean] enable the stop/resume behaviour above
# @yield [item] each child item, folders included
def list_items(drive_id, fields: [], break_after_page: false)
  # MSFT Graph API does not have a recursive list items, have to do this dfs style

  stack =
    if break_after_page && cursors['page_cursor'].present?
      cursors.delete('page_cursor')
    else
      [get_root_item(drive_id, ['id']).id]
    end

  # The DFS reads `id` (what to descend into) and `folder` (whether to descend)
  # from every child. With an explicit selection that omits `folder`, no child
  # would ever look like a folder and the walk would stop after the first level.
  traversal_fields = fields.any? ? fields | %w[id folder] : fields
  yielded = 0
  while stack.any?
    folder_id = stack.pop
    item_children(drive_id, folder_id, :fields => traversal_fields) do |item|
      stack << item.id if item.folder
      yield item

      yielded += 1
    end

    if break_after_page && yielded >= 100 && stack.any?
      cursors['page_cursor'] = stack.dup
      break
    end
  end
end
143
+
144
# Requests the permissions of one item and returns the `value` of the response.
#
# @param drive_id the drive containing the item
# @param item_id the item whose permissions are requested
def item_permissions(drive_id, item_id)
  permissions_response = request_endpoint(:endpoint => "drives/#{drive_id}/items/#{item_id}/permissions")
  permissions_response.value
end
147
+
148
# Pages through a drive's delta feed and yields each relevant change.
#
# The first page comes from, in order of precedence: a saved
# cursors['page_cursor'] (only with `break_after_page`), the drive's
# 'root/delta' endpoint (when no `start_delta_link` is given), or
# `start_delta_link`. Following pages are fetched via '@odata.nextLink'.
#
# With `break_after_page`, paging stops once at least 100 changes were yielded
# and another page exists; that page's link is saved as cursors['page_cursor'].
#
# After the loop, the '@odata.deltaLink' of the last fetched response is stored
# as the drive's cursor, whatever its value.
#
# @param drive_id the drive to inspect
# @param start_delta_link [String, nil] delta link to resume from
# @param last_modified [Time, nil] when present, older changes are skipped
# @param break_after_page [Boolean] enable the stop/resume behaviour above
# @yield [change] each change that is neither too old nor the drive root
def list_changes(drive_id:, start_delta_link: nil, last_modified: nil, break_after_page: false)
  selected_fields = %w(id content.downloadUrl lastModifiedDateTime lastModifiedBy root deleted file folder package name webUrl createdBy createdDateTime size)
  query_params = { :'$select' => selected_fields.join(',') }
  response =
    if break_after_page && cursors['page_cursor'].present?
      request_json(:url => cursors.delete('page_cursor'))
    elsif start_delta_link.nil?
      request_endpoint(:endpoint => "drives/#{drive_id}/root/delta", :query_params => query_params)
    else
      request_json(:url => start_delta_link, :query_params => query_params)
    end

  yielded = 0
  loop do
    response.value.each do |change|
      # MSFT Graph API does not allow us to view "changes" in chronological order, so if there is no cursor,
      # we have to iterate through all changes and cherry-pick the ones that are past the `last_modified` Time
      # since to get another cursor, we would have to go through all the changes anyway
      next if last_modified.present? && Time.parse(change.lastModifiedDateTime) < last_modified
      next if change.root # We don't want to index the root of the drive

      yield change
      yielded += 1
    end

    next_link = response['@odata.nextLink']
    if break_after_page && yielded >= 100 && next_link.present?
      cursors['page_cursor'] = next_link
      break
    end

    break if next_link.nil?

    response = request_json(:url => next_link)
  end

  cursors[ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY][drive_id] = response['@odata.deltaLink']
end
184
+
185
# Returns the stored delta link for a drive, computing one via
# `exhaustively_get_delta_link` when none is stored.
#
# @param drive_id the drive whose delta link is wanted
def get_latest_delta_link(drive_id)
  stored_links = cursors[ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY]
  stored_links[drive_id] || exhaustively_get_delta_link(drive_id)
end
188
+
189
# Pages through the whole delta feed of a drive (selecting only `id`) until no
# '@odata.nextLink' remains, then returns the final '@odata.deltaLink' with
# everything from the first '?' onwards removed. The work runs inside a
# Connectors::Stats.measure block.
#
# @param drive_id the drive whose delta feed is read
# @return [String]
def exhaustively_get_delta_link(drive_id)
  Connectors::Stats.measure('custom_client.office365.exhaustively_get_delta_link') do
    page = request_endpoint(:endpoint => "drives/#{drive_id}/root/delta", :query_params => { :'$select' => 'id' })

    loop do
      next_link = page['@odata.nextLink']
      break unless next_link

      page = request_json(:url => next_link)
    end

    page['@odata.deltaLink'].split('?').first
  end
end
202
+
203
# Downloads the content behind a download URL.
#
# The request uses Office365DownloadParamsEncoder as its params encoder.
#
# @param download_url the URL to fetch
# @return the response body
def download_item(download_url)
  download = request(:url => download_url) do |outgoing|
    outgoing.options.params_encoder = Office365DownloadParamsEncoder
  end
  download.body
end
208
+
209
# Lists the groups of a user: everything returned by 'transitiveMemberOf'
# plus those owned objects whose '@odata.type' is '#microsoft.graph.group',
# with duplicates removed.
#
# @param user_id the user to look up
# @param fields [Array<String>] optional field names to select (positional)
# @return [Array]
def user_groups(user_id, fields = [])
  memberships = request_all(:endpoint => "users/#{user_id}/transitiveMemberOf", :fields => fields)
  owned_objects = request_all(:endpoint => "users/#{user_id}/ownedObjects", :fields => fields)
  owned_groups = owned_objects.select { |next_object| next_object['@odata.type'] == '#microsoft.graph.group' }

  (memberships + owned_groups).uniq
end
221
+
222
+ private
223
+
224
# Returns the root sites of groups created within the last
# OFFICE365_PERMISSION_SYNC_TIME_SLA.
#
# `createdDateTime` is parsed when it is a String, because a String cannot be
# compared with a Time. Groups without a creation timestamp are skipped, and
# nil root sites are dropped.
#
# @param fields currently unused by this method; the requested fields are fixed
# @return [Array] root sites of the recently created groups
def recent_share_point_group_sites(fields: [])
  # group.createdDateTime field is UTC as stated in documentation:
  # https://docs.microsoft.com/en-us/graph/api/resources/group?view=graph-rest-1.0#properties
  created_date_time_threshold = Time.now.utc - OFFICE365_PERMISSION_SYNC_TIME_SLA

  recent_groups = groups(:fields => %w(id createdDateTime)).select do |group|
    created_at = group.createdDateTime
    created_at = Time.parse(created_at) if created_at.is_a?(String)
    !created_at.nil? && created_at > created_date_time_threshold
  end

  recent_groups.map { |group| group_root_site(group.id, :fields => %w[id]) }.compact
end
233
+
234
# Keeps only the drives whose driveType is 'documentLibrary'.
#
# @param drives [Array] drives responding to `driveType`
# @return [Array]
def document_libraries(drives)
  drives.select do |candidate|
    candidate.driveType == 'documentLibrary'
  end
end
237
+
238
# Turns a list of field names into a `$select` query parameter.
#
# @param fields [Array<String>] field names
# @return [Hash] empty for no fields, otherwise the names comma-joined under :'$select'
def transform_fields_to_request_query_params(fields = [])
  return {} if fields.empty?

  { :'$select' => fields.join(',') }
end
241
+
242
# Requests an endpoint and follows '@odata.nextLink' until it is absent,
# accumulating the `value` of every page.
#
# @param endpoint [String] endpoint relative to the base URL
# @param fields [Array<String>] optional field names to select
# @param additional_query_params [Hash] merged over the select params for the first request
# @return the first page's `value`, extended in place with the later pages
def request_all(endpoint:, fields: [], additional_query_params: {})
  select_params = transform_fields_to_request_query_params(fields)
  page = request_endpoint(:endpoint => endpoint, :query_params => select_params.merge(additional_query_params))

  collected = page.value
  loop do
    next_link = page['@odata.nextLink']
    break unless next_link

    page = request_json(:url => next_link)
    collected.concat(page.value)
  end
  collected
end
253
+
254
# Requests the root item of a drive.
#
# @param drive_id the drive whose root is requested
# @param fields [Array<String>] optional field names to select (positional)
# @return the parsed response
def get_root_item(drive_id, fields = [])
  request_endpoint(
    :endpoint => "drives/#{drive_id}/root",
    :query_params => transform_fields_to_request_query_params(fields)
  )
end
258
+
259
# Passes every child of an item to the block, following '@odata.nextLink'
# from page to page until it is absent.
#
# @param drive_id the drive containing the item
# @param item_id the parent item
# @param fields [Array<String>] optional field names to select
# @yield [child] each child on each page
def item_children(drive_id, item_id, fields: [], &block)
  page = request_endpoint(
    :endpoint => "drives/#{drive_id}/items/#{item_id}/children",
    :query_params => transform_fields_to_request_query_params(fields)
  )

  until page.nil?
    page.value.each(&block)
    next_link = page['@odata.nextLink']
    page = next_link.nil? ? nil : request_json(:url => next_link)
  end
end
271
+
272
# Headers sent with JSON requests: bearer authorization built from the current
# access token, plus a JSON content type.
#
# @return [Hash]
def base_headers
  bearer = "Bearer #{access_token}"
  { 'Authorization' => bearer, 'Content-Type' => 'application/json' }
end
278
+
279
# Returns the response when its status is successful; otherwise raises.
#
# For a failed response the body is parsed as JSON:
# - an error code of 'resyncRequired' increments a stat and raises
#   Office365InvalidCursorsError;
# - otherwise the error's `message` is itself parsed as JSON and its stripped
#   'Message' entry becomes the ClientError message;
# - if any of that parsing fails, a generic message naming the status, URL and
#   query is used instead.
#
# @param response object responding to `status` and `body`
# @param url the requested URL (stored on the raised error)
# @param query_params the query that was sent (only used in the generic message)
# @raise [Office365InvalidCursorsError, ClientError]
def raise_any_errors(response, url:, query_params: {})
  return response if HTTP::Status.successful?(response.status)

  raw_body = response.body.to_s
  error_message =
    begin
      error = JSON.parse(raw_body).fetch('error')
      if error['code'] == 'resyncRequired'
        Connectors::Stats.increment('custom_client.office365.error.invalid_cursors')
        raise Office365InvalidCursorsError.new(response.status, url)
      end
      JSON.parse(error.fetch('message')).fetch('Message').strip
    rescue ClientError
      # Let the invalid-cursor error through instead of turning it into a message.
      raise
    rescue StandardError
      "got a #{response.status} from #{url} with query #{query_params}"
    end

  raise ClientError.new(response.status, url), error_message
end
299
+
300
# Requests an endpoint relative to BASE_URL and returns the parsed JSON.
#
# @param endpoint [String] path appended to BASE_URL
# @param query_params [Hash, nil] optional query parameters
def request_endpoint(endpoint:, query_params: nil)
  request_json(:url => "#{BASE_URL}#{endpoint}", :query_params => query_params)
end
304
+
305
# Performs a request with the base headers and wraps the parsed JSON body in
# a Hashie::Mash.
#
# @param url [String] absolute URL to request
# @param query_params [Hash, nil] optional query parameters
# @return [Hashie::Mash]
def request_json(url:, query_params: nil)
  raw_response = request(:url => url, :query_params => query_params, :headers => base_headers)
  parsed_body = JSON.parse(raw_response.body)
  Hashie::Mash.new(parsed_body)
end
309
+
310
# Issues a GET and runs the response through `raise_any_errors`.
#
# @param url [String] absolute URL to request
# @param query_params [Hash, nil] optional query parameters
# @param headers [Hash, nil] optional request headers
# @param block forwarded to `get`
# @return the response, when `raise_any_errors` does not raise
def request(url:, query_params: nil, headers: nil, &block)
  http_response = get(url, query_params, headers, &block)
  raise_any_errors(http_response, :url => url, :query_params => query_params)
end
317
+ end
318
+ end
319
+ end