connectors_sdk 8.3.0.0.pre.20220510T144908Z → 8.3.0.0.pre.20220517T144653Z

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 94b75b9fa3a5f0c46a271a34d073f71977629db6095b6364c8710b0aab92374b
4
- data.tar.gz: d133f34052f43e0b8b65ba10eb946c55a6ef421a6296d2d4438e7f4f92b45696
3
+ metadata.gz: b86ca5e489e3cef3b9f2c04a462baf71d6b43805731b0cb52ba2f56f5078d6d3
4
+ data.tar.gz: 044e860f11163e82c63f66276c3d5628b761c5dfcc5168fc8f80b63ca87d19f0
5
5
  SHA512:
6
- metadata.gz: 712b819efcfa755ce19e4cdb7060dbadefecf37321d07bfe4e7bc9b18254e0f11f5fc29967ff171b1314064d77427bde730ab400a4a2dcde89b31dc6344d4a34
7
- data.tar.gz: fa1fedb0c7449b9b2b50b1a56a710178cd3343a108500938244987eb31c694ab0d0214f383df69b6efa6b384eb558697553813c9b93cb142f011b374e05f2c49
6
+ metadata.gz: 528fa5260cf80a3ebb918478e1be2e7cac1668588853ad72b1259095086090c15a0cbc028523b1de8b775b3b64ed7427c6fab7ffccc562d559c8164456b84c4b
7
+ data.tar.gz: a58d353e2b48ffda33aa287d0fa6bb1400c531aae2794995ff3b644b779426d10bb3efc8735d8e07c353fae64bc0b3b0af58427d989f499210a19244b23c1a35
@@ -74,7 +74,7 @@ module ConnectorsSdk
74
74
  faraday.use(*middleware_config)
75
75
  end
76
76
 
77
- faraday.adapter(:httpclient)
77
+ faraday.adapter :httpclient
78
78
  end
79
79
  end
80
80
 
@@ -22,31 +22,31 @@ module ConnectorsSdk
22
22
  )
23
23
  end
24
24
 
25
- def document_batch(params)
25
+ def extract(params)
26
26
  convert_third_party_errors do
27
- results = []
28
-
29
27
  extractor = extractor(params)
30
28
 
31
- extractor.yield_document_changes(:break_after_page => true, :modified_since => extractor.config.cursors['modified_since']) do |action, doc, download_args_and_proc|
29
+ extractor.yield_document_changes(:modified_since => extractor.config.cursors[:modified_since]) do |action, doc, download_args_and_proc|
32
30
  download_obj = nil
33
31
  if download_args_and_proc
34
32
  download_obj = {
35
- id: download_args_and_proc[0],
36
- name: download_args_and_proc[1],
37
- size: download_args_and_proc[2],
38
- download_args: download_args_and_proc[3]
33
+ id: download_args_and_proc[0],
34
+ name: download_args_and_proc[1],
35
+ size: download_args_and_proc[2],
36
+ download_args: download_args_and_proc[3]
39
37
  }
40
38
  end
41
39
 
42
- results << {
43
- :action => action,
44
- :document => doc,
45
- :download => download_obj
40
+ doc = {
41
+ :action => action,
42
+ :document => doc,
43
+ :download => download_obj
46
44
  }
45
+
46
+ yield doc
47
47
  end
48
48
 
49
- [results, extractor.config.cursors, extractor.completed]
49
+ extractor.config.to_h[:cursors]
50
50
  end
51
51
  end
52
52
 
@@ -20,7 +20,7 @@ module ConnectorsSdk
20
20
 
21
21
  ConnectorsSdk::Base::Extractor::TRANSIENT_SERVER_ERROR_CLASSES << Atlassian::CustomClient::ServiceUnavailableError
22
22
 
23
- def yield_document_changes(modified_since: nil, break_after_page: false)
23
+ def yield_document_changes(modified_since: nil)
24
24
  @space_permissions_cache = {}
25
25
  @content_restriction_cache = {}
26
26
  yield_spaces do |space|
@@ -50,11 +50,6 @@ module ConnectorsSdk
50
50
  yield :create_or_update, Confluence::Adapter.es_document_from_confluence_content(content, content_base_url, restrictions)
51
51
  end
52
52
  end
53
-
54
- if break_after_page
55
- @completed = true
56
- break
57
- end
58
53
  end
59
54
  end
60
55
 
@@ -52,7 +52,6 @@ module ConnectorsSdk
52
52
  def initialize(access_token:, cursors: {}, ensure_fresh_auth: nil)
53
53
  @access_token = access_token
54
54
  @cursors = cursors || {}
55
- @cursors[ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY] ||= {}
56
55
  super(:ensure_fresh_auth => ensure_fresh_auth)
57
56
  end
58
57
 
@@ -77,12 +76,11 @@ module ConnectorsSdk
77
76
  # recently created groups (new Private Team site will be there) to reduce friction and index this site
78
77
  # earlier.
79
78
  # See: https://github.com/elastic/ent-search/pull/3581
80
- share_point_sites = (sites(:fields => %w[id]) + recent_share_point_group_sites(:fields => %[id]))
79
+ share_point_sites = (sites(:fields => %w[id,name]) + recent_share_point_group_sites(:fields => %w[id,name]))
81
80
 
82
81
  share_point_sites
83
- .map(&:id)
84
- .uniq
85
- .map { |site_id| site_drives(site_id, :fields => fields) }
82
+ .uniq(&:id)
83
+ .map { |site| site_drives(site, :fields => fields) }
86
84
  .flatten
87
85
  .compact
88
86
  end
@@ -104,47 +102,32 @@ module ConnectorsSdk
104
102
  request_all(:endpoint => 'sites/', :fields => fields, :additional_query_params => { :search => '', :top => 10 })
105
103
  end
106
104
 
107
- def site_drives(site_id, fields: [])
105
+ def site_drives(site, fields: [])
108
106
  document_libraries(
109
- request_all(:endpoint => "sites/#{site_id}/drives/", :fields => fields)
110
- )
107
+ request_all(:endpoint => "sites/#{site.id}/drives/", :fields => fields)
108
+ ).map do |drive|
109
+ drive.site_name = site.name
110
+ drive
111
+ end
111
112
  rescue ClientError => e
112
113
  ConnectorsShared::Logger.info("Received response of #{e.status_code} trying to get drive for Site with Id = #{site_id}: #{e.message}")
113
114
  nil
114
115
  end
115
116
 
116
- def list_items(drive_id, fields: [], break_after_page: false)
117
+ def list_items(drive_id, fields: [])
117
118
  # MSFT Graph API does not have a recursive list items, have to do this dfs style
118
-
119
- stack = if break_after_page && cursors['page_cursor'].present?
120
- cursors.delete('page_cursor')
121
- else
122
- [get_root_item(drive_id, ['id']).id]
123
- end
124
-
119
+ stack = [get_root_item(drive_id, ['id']).id]
125
120
  # We rely on the id field below to perform our DFS
126
121
  fields_with_id = fields.any? ? fields | ['id'] : fields
127
- yielded = 0
128
122
  while stack.any?
129
123
  folder_id = stack.pop
130
- item_children(drive_id, folder_id, :fields => fields_with_id, :break_after_page => break_after_page) do |item|
124
+ item_children(drive_id, folder_id, :fields => fields_with_id) do |item|
131
125
  if item.folder
132
126
  stack << item.id
133
127
  end
134
128
  yield item
135
-
136
- yielded += 1
137
129
  end
138
130
 
139
- if break_after_page && yielded >= 100
140
- if cursors['item_children_next_link'].present?
141
- stack << folder_id
142
- end
143
- if stack.any?
144
- cursors['page_cursor'] = stack.dup
145
- break
146
- end
147
- end
148
131
  end
149
132
  end
150
133
 
@@ -152,19 +135,16 @@ module ConnectorsSdk
152
135
  request_endpoint(:endpoint => "drives/#{drive_id}/items/#{item_id}/permissions").value
153
136
  end
154
137
 
155
- def list_changes(drive_id:, start_delta_link: nil, last_modified: nil, break_after_page: false)
138
+ def list_changes(drive_id:, start_delta_link: nil, last_modified: nil)
156
139
  query_params = { :'$select' => %w(id content.downloadUrl lastModifiedDateTime lastModifiedBy root deleted file folder package name webUrl createdBy createdDateTime size).join(',') }
157
140
  response =
158
- if break_after_page && cursors['page_cursor'].present?
159
- request_json(:url => cursors.delete('page_cursor'))
160
- elsif start_delta_link.nil?
141
+ if start_delta_link.nil?
161
142
  endpoint = "drives/#{drive_id}/root/delta"
162
143
  request_endpoint(:endpoint => endpoint, :query_params => query_params)
163
144
  else
164
145
  request_json(:url => start_delta_link, :query_params => query_params)
165
146
  end
166
147
 
167
- yielded = 0
168
148
  loop do
169
149
  response.value.each do |change|
170
150
  # MSFT Graph API does not allow us to view "changes" in chronological order, so if there is no cursor,
@@ -172,25 +152,18 @@ module ConnectorsSdk
172
152
  # since to get another cursor, we would have to go through all the changes anyway
173
153
  next if last_modified.present? && Time.parse(change.lastModifiedDateTime) < last_modified
174
154
  next if change.root # We don't want to index the root of the drive
175
-
176
155
  yield change
177
- yielded += 1
178
- end
179
-
180
- if break_after_page && yielded >= 100 && response['@odata.nextLink'].present?
181
- cursors['page_cursor'] = response['@odata.nextLink']
182
- break
183
156
  end
184
157
 
185
158
  break if response['@odata.nextLink'].nil?
186
159
  response = request_json(:url => response['@odata.nextLink'])
187
160
  end
188
161
 
189
- cursors[ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY][drive_id] = response['@odata.deltaLink']
162
+ cursors[drive_id] = response['@odata.deltaLink']
190
163
  end
191
164
 
192
165
  def get_latest_delta_link(drive_id)
193
- cursors[ConnectorsSdk::Office365::Extractor::DRIVE_IDS_CURSOR_KEY][drive_id] || exhaustively_get_delta_link(drive_id)
166
+ cursors[drive_id] || exhaustively_get_delta_link(drive_id)
194
167
  end
195
168
 
196
169
  def exhaustively_get_delta_link(drive_id)
@@ -210,6 +183,7 @@ module ConnectorsSdk
210
183
  def download_item(download_url)
211
184
  request(:url => download_url) do |request|
212
185
  request.options.params_encoder = Office365DownloadParamsEncoder
186
+ request.options.timeout = 30
213
187
  end.body
214
188
  end
215
189
 
@@ -235,7 +209,7 @@ module ConnectorsSdk
235
209
 
236
210
  groups(:fields => %w(id createdDateTime))
237
211
  .select { |group| group.createdDateTime > created_date_time_threshold }
238
- .map { |group| group_root_site(group.id, :fields => %w[id]) }.compact
212
+ .map { |group| group_root_site(group.id, :fields => fields) }.compact
239
213
  end
240
214
 
241
215
  def document_libraries(drives)
@@ -263,30 +237,15 @@ module ConnectorsSdk
263
237
  request_endpoint(:endpoint => "drives/#{drive_id}/root", :query_params => query_params)
264
238
  end
265
239
 
266
- def item_children(drive_id, item_id, fields: [], break_after_page: false, &block)
267
- next_link = cursors.delete('item_children_next_link') if break_after_page
268
-
269
- response = if next_link.present?
270
- request_json(:url => next_link)
271
- else
272
- endpoint = "drives/#{drive_id}/items/#{item_id}/children"
273
- query_params = transform_fields_to_request_query_params(fields)
274
- request_endpoint(:endpoint => endpoint, :query_params => query_params)
275
- end
240
+ def item_children(drive_id, item_id, fields: [], &block)
241
+ endpoint = "drives/#{drive_id}/items/#{item_id}/children"
242
+ query_params = transform_fields_to_request_query_params(fields)
243
+ response = request_endpoint(:endpoint => endpoint, :query_params => query_params)
276
244
 
277
- yielded = 0
278
245
  loop do
279
246
  response.value.each(&block)
280
247
  next_link = response['@odata.nextLink']
281
-
282
248
  break if next_link.nil?
283
-
284
- yielded += response.value.size
285
- if break_after_page && yielded >= 100
286
- cursors['item_children_next_link'] = next_link
287
- break
288
- end
289
-
290
249
  response = request_json(:url => next_link)
291
250
  end
292
251
  end
@@ -14,52 +14,34 @@ module ConnectorsSdk
14
14
  class Extractor < ConnectorsSdk::Base::Extractor
15
15
  DRIVE_IDS_CURSOR_KEY = 'drive_ids'.freeze
16
16
 
17
- def yield_document_changes(modified_since: nil, break_after_page: false, &block)
17
+ def yield_document_changes(modified_since: nil, &block)
18
18
  drives_to_index.each do |drive|
19
19
  drive_id = drive.id
20
-
21
- if break_after_page
22
- current_drive_id = config.cursors['current_drive_id']
23
- if current_drive_id.present? && current_drive_id > drive_id # they come alpha sorted
24
- next
25
- end
26
- config.cursors['current_drive_id'] = drive_id
27
- end
28
-
29
20
  drive_owner_name = drive.dig(:owner, :user, :displayName)
30
21
  drive_name = drive.name
22
+ site_name = drive.site_name
31
23
 
32
24
  drive_id_to_delta_link = config.cursors.fetch(DRIVE_IDS_CURSOR_KEY, {})
33
25
  begin
34
26
  if start_delta_link = drive_id_to_delta_link[drive_id]
35
27
  log_debug("Starting an incremental crawl with cursor for #{service_type.classify} with drive_id: #{drive_id}")
36
28
  begin
37
- yield_changes(drive_id, :start_delta_link => start_delta_link, :drive_owner_name => drive_owner_name, :drive_name => drive_name, :break_after_page => break_after_page, &block)
29
+ yield_changes(drive_id, :start_delta_link => start_delta_link, :drive_owner_name => drive_owner_name, :drive_name => drive_name, :site_name => site_name, &block)
38
30
  rescue ConnectorsSdk::Office365::CustomClient::Office365InvalidCursorsError
39
31
  log_warn("Error listing changes with start_delta_link: #{start_delta_link}, falling back to full crawl")
40
- yield_drive_items(drive_id, :drive_owner_name => drive_owner_name, :drive_name => drive_name, :break_after_page => break_after_page, &block)
32
+ yield_drive_items(drive_id, :drive_owner_name => drive_owner_name, :drive_name => drive_name, :site_name => site_name, &block)
41
33
  end
42
34
  elsif modified_since.present?
43
35
  log_debug("Starting an incremental crawl using last_modified (no cursor found) for #{service_type.classify} with drive_id: #{drive_id}")
44
- yield_changes(drive_id, :last_modified => modified_since, :drive_owner_name => drive_owner_name, :drive_name => drive_name, :break_after_page => break_after_page, &block)
36
+ yield_changes(drive_id, :last_modified => modified_since, :drive_owner_name => drive_owner_name, :drive_name => drive_name, :site_name => site_name, &block)
45
37
  else
46
38
  log_debug("Starting a full crawl #{service_type.classify} with drive_id: #{drive_id}")
47
- yield_drive_items(drive_id, :drive_owner_name => drive_owner_name, :drive_name => drive_name, :break_after_page => break_after_page, &block)
39
+ yield_drive_items(drive_id, :drive_owner_name => drive_owner_name, :drive_name => drive_name, :site_name => site_name, &block)
48
40
  end
49
41
  rescue ConnectorsSdk::Office365::CustomClient::ClientError => e
50
42
  log_warn("Error searching and listing drive #{drive_id}")
51
43
  capture_exception(e)
52
44
  end
53
-
54
- if break_after_page && (config.cursors['page_cursor'].present? || config.cursors['item_children_next_link'].present?)
55
- break
56
- end
57
- end
58
-
59
- if break_after_page && config.cursors['page_cursor'].blank? && config.cursors['item_children_next_link'].blank?
60
- @completed = true
61
- config.overwrite_cursors!(retrieve_latest_cursors)
62
- log_debug("Completed #{modified_since.nil? ? 'full' : 'incremental'} extraction")
63
45
  end
64
46
 
65
47
  nil
@@ -156,11 +138,12 @@ module ConnectorsSdk
156
138
  ConnectorsShared::ExceptionTracking.capture_exception(office365_client_error, options)
157
139
  end
158
140
 
159
- def yield_drive_items(drive_id, drive_owner_name:, drive_name:, break_after_page: false, &block)
160
- client.list_items(drive_id, break_after_page: break_after_page) do |item|
141
+ def yield_drive_items(drive_id, drive_owner_name:, drive_name:, site_name:, &block)
142
+ client.list_items(drive_id) do |item|
161
143
  yield_single_document_change(:identifier => "Office365 change: #{item&.id} (#{Office365::Adapter::GraphItem.get_path(item)})") do
162
144
  item.drive_owner_name = drive_owner_name
163
145
  item.drive_name = drive_name
146
+ item.site_name = site_name
164
147
  yield_create_or_update(drive_id, item, &block)
165
148
  end
166
149
  end
@@ -174,11 +157,12 @@ module ConnectorsSdk
174
157
  end
175
158
  end
176
159
 
177
- def yield_changes(drive_id, drive_owner_name:, drive_name:, start_delta_link: nil, last_modified: nil, break_after_page: false, &block)
178
- client.list_changes(:drive_id => drive_id, :start_delta_link => start_delta_link, :last_modified => last_modified, :break_after_page => break_after_page) do |item|
160
+ def yield_changes(drive_id, drive_owner_name:, drive_name:, site_name:, start_delta_link: nil, last_modified: nil, &block)
161
+ client.list_changes(:drive_id => drive_id, :start_delta_link => start_delta_link, :last_modified => last_modified) do |item|
179
162
  yield_single_document_change(:identifier => "Office365 change: #{item&.id} (#{Office365::Adapter::GraphItem.get_path(item)})") do
180
163
  item.drive_owner_name = drive_owner_name
181
164
  item.drive_name = drive_name
165
+ item.site_name = site_name
182
166
  yield_correct_actions_and_converted_item(drive_id, item, &block)
183
167
  end
184
168
  end
@@ -10,6 +10,12 @@ require 'connectors_sdk/office365/adapter'
10
10
 
11
11
  module ConnectorsSdk
12
12
  module SharePoint
13
+ module SitePrefix
14
+ def get_path(item)
15
+ item.site_name.present? ? "/sites/#{item.site_name}#{super}" : super
16
+ end
17
+ end
18
+
13
19
  class Adapter < Office365::Adapter
14
20
  generate_id_helpers :share_point, 'share_point'
15
21
 
@@ -26,18 +32,24 @@ module ConnectorsSdk
26
32
  end
27
33
 
28
34
  class FileGraphItem < Office365::Adapter::FileGraphItem
35
+ include SitePrefix
36
+
29
37
  def self.convert_id_to_es_id(id)
30
38
  ConnectorsSdk::SharePoint::Adapter.share_point_id_to_es_id(id)
31
39
  end
32
40
  end
33
41
 
34
42
  class FolderGraphItem < Office365::Adapter::FolderGraphItem
43
+ include SitePrefix
44
+
35
45
  def self.convert_id_to_es_id(id)
36
46
  ConnectorsSdk::SharePoint::Adapter.share_point_id_to_es_id(id)
37
47
  end
38
48
  end
39
49
 
40
50
  class PackageGraphItem < Office365::Adapter::PackageGraphItem
51
+ include SitePrefix
52
+
41
53
  def self.convert_id_to_es_id(id)
42
54
  ConnectorsSdk::SharePoint::Adapter.share_point_id_to_es_id(id)
43
55
  end
@@ -0,0 +1,18 @@
1
+ #
2
+ # Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
3
+ # or more contributor license agreements. Licensed under the Elastic License;
4
+ # you may not use this file except in compliance with the Elastic License.
5
+ #
6
+
7
+ module ConnectorsShared
8
+ class JobStatus
9
+ CREATED = 'created'
10
+ RUNNING = 'running'
11
+ FINISHED = 'finished'
12
+ FAILED = 'failed'
13
+
14
+ def self.is_valid?(status)
15
+ [CREATED, RUNNING, FINISHED, FAILED].include? status
16
+ end
17
+ end
18
+ end
@@ -8,5 +8,6 @@ require 'connectors_shared/constants'
8
8
  require 'connectors_shared/errors'
9
9
  require 'connectors_shared/exception_tracking'
10
10
  require 'connectors_shared/extension_mapping_util'
11
+ require 'connectors_shared/job_status'
11
12
  require 'connectors_shared/logger'
12
13
  require 'connectors_shared/monitor'
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: connectors_sdk
3
3
  version: !ruby/object:Gem::Version
4
- version: 8.3.0.0.pre.20220510T144908Z
4
+ version: 8.3.0.0.pre.20220517T144653Z
5
5
  platform: ruby
6
6
  authors:
7
7
  - Elastic
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-05-10 00:00:00.000000000 Z
11
+ date: 2022-05-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: activesupport
@@ -92,6 +92,7 @@ files:
92
92
  - lib/connectors_shared/exception_tracking.rb
93
93
  - lib/connectors_shared/extension_mapping_util.rb
94
94
  - lib/connectors_shared/extraction_utils.rb
95
+ - lib/connectors_shared/job_status.rb
95
96
  - lib/connectors_shared/logger.rb
96
97
  - lib/connectors_shared/middleware/basic_auth.rb
97
98
  - lib/connectors_shared/middleware/bearer_auth.rb
@@ -102,7 +103,7 @@ homepage: https://github.com/elastic/connectors
102
103
  licenses:
103
104
  - Elastic-2.0
104
105
  metadata:
105
- revision: fb1187beef857b555633e1804eef3ed5e586091d
106
+ revision: 9f25f35e17ffb36dfda754d657794ed9b5d2d75a
106
107
  repository: git@github.com:elastic/connectors.git
107
108
  post_install_message:
108
109
  rdoc_options: []