connectors_service 8.6.0.3 → 8.6.0.4.pre.20221114T233727Z
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/config/connectors.yml +9 -10
- data/lib/app/config.rb +2 -0
- data/lib/app/dispatcher.rb +17 -1
- data/lib/app/preflight_check.rb +15 -0
- data/lib/connectors/base/connector.rb +37 -4
- data/lib/connectors/base/simple_rules_parser.rb +42 -0
- data/lib/connectors/connector_status.rb +4 -4
- data/lib/connectors/example/{example_attachments → attachments}/first_attachment.txt +0 -0
- data/lib/connectors/example/{example_attachments → attachments}/second_attachment.txt +0 -0
- data/lib/connectors/example/{example_attachments → attachments}/third_attachment.txt +0 -0
- data/lib/connectors/example/connector.rb +43 -4
- data/lib/connectors/gitlab/connector.rb +16 -2
- data/lib/connectors/mongodb/connector.rb +173 -50
- data/lib/connectors/mongodb/mongo_rules_parser.rb +81 -0
- data/lib/connectors/registry.rb +2 -2
- data/lib/connectors/sync_status.rb +23 -4
- data/lib/core/configuration.rb +4 -2
- data/lib/core/connector_job.rb +137 -0
- data/lib/core/connector_settings.rb +29 -18
- data/lib/core/elastic_connector_actions.rb +331 -32
- data/lib/core/filtering/post_process_engine.rb +39 -0
- data/lib/core/filtering/post_process_result.rb +27 -0
- data/lib/core/filtering/simple_rule.rb +141 -0
- data/lib/core/filtering/validation_job_runner.rb +53 -0
- data/lib/{connectors_app/// → core/filtering/validation_status.rb} +9 -5
- data/lib/core/filtering.rb +17 -0
- data/lib/core/ingestion/es_sink.rb +59 -0
- data/lib/core/ingestion/ingester.rb +90 -0
- data/lib/core/{output_sink.rb → ingestion.rb} +2 -5
- data/lib/core/native_scheduler.rb +3 -0
- data/lib/core/scheduler.rb +43 -10
- data/lib/core/single_scheduler.rb +3 -0
- data/lib/core/sync_job_runner.rb +78 -18
- data/lib/core.rb +2 -0
- data/lib/utility/bulk_queue.rb +85 -0
- data/lib/utility/common.rb +20 -0
- data/lib/utility/constants.rb +2 -0
- data/lib/utility/errors.rb +5 -0
- data/lib/utility/es_client.rb +6 -2
- data/lib/utility/filtering.rb +22 -0
- data/lib/utility/logger.rb +2 -1
- data/lib/utility.rb +5 -3
- metadata +27 -18
- data/lib/core/output_sink/base_sink.rb +0 -33
- data/lib/core/output_sink/combined_sink.rb +0 -38
- data/lib/core/output_sink/console_sink.rb +0 -51
- data/lib/core/output_sink/es_sink.rb +0 -74
@@ -10,8 +10,27 @@ require 'active_support/core_ext/hash'
|
|
10
10
|
require 'connectors/connector_status'
|
11
11
|
require 'connectors/sync_status'
|
12
12
|
require 'utility'
|
13
|
+
require 'elastic-transport'
|
13
14
|
|
14
15
|
module Core
|
16
|
+
# Error raised by claim_job when the connector's last sync status is still "in progress".
class JobAlreadyRunningError < StandardError
  # @param connector_id [String] id of the connector that already has a running sync
  def initialize(connector_id)
    message = "Sync job for connector '#{connector_id}' is already running."
    super(message)
  end
end
|
21
|
+
|
22
|
+
# Error raised by claim_job when indexing the job document does not report 'created'.
class JobNotCreatedError < StandardError
  # @param connector_id [String] id of the connector the job was meant for
  # @param response [Object] the index response, interpolated into the message
  def initialize(connector_id, response)
    message = "Sync job for connector '#{connector_id}' could not be created. Response: #{response}"
    super(message)
  end
end
|
27
|
+
|
28
|
+
# Error raised by update_connector_fields when Elasticsearch reports a conflict on update.
class ConnectorVersionChangedError < StandardError
  # @param connector_id [String] id of the connector being updated
  # @param seq_no [Integer, nil] sequence number the update was conditioned on
  # @param primary_term [Integer, nil] primary term the update was conditioned on
  def initialize(connector_id, seq_no, primary_term)
    message = "Version conflict: seq_no [#{seq_no}] and primary_term [#{primary_term}] do not match for connector '#{connector_id}'."
    super(message)
  end
end
|
33
|
+
|
15
34
|
class ElasticConnectorActions
|
16
35
|
class << self
|
17
36
|
|
@@ -30,10 +49,17 @@ module Core
|
|
30
49
|
end
|
31
50
|
|
32
51
|
# Fetches the connector document with the given id from the connectors index.
# A 404 from Elasticsearch is ignored (:ignore => 404) instead of being raised.
#
# @param connector_id [String] id of the connector document
# @return the client response converted with with_indifferent_access
def get_connector(connector_id)
  # TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
  response = client.get(:index => Utility::Constants::CONNECTORS_INDEX, :id => connector_id, :ignore => 404)
  response.with_indifferent_access
end
|
35
55
|
|
56
|
+
# Fetches the job document with the given id from the job index.
# A 404 from Elasticsearch is ignored (:ignore => 404) instead of being raised.
#
# @param job_id [String] id of the job document
# @return the client response converted with with_indifferent_access
def get_job(job_id)
  # TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
  response = client.get(:index => Utility::Constants::JOB_INDEX, :id => job_id, :ignore => 404)
  response.with_indifferent_access
end
|
60
|
+
|
36
61
|
def connectors_meta
|
62
|
+
# TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
|
37
63
|
alias_mappings = client.indices.get_mapping(:index => Utility::Constants::CONNECTORS_INDEX).with_indifferent_access
|
38
64
|
index = get_latest_index_in_alias(Utility::Constants::CONNECTORS_INDEX, alias_mappings.keys)
|
39
65
|
alias_mappings.dig(index, 'mappings', '_meta') || {}
|
@@ -52,6 +78,19 @@ module Core
|
|
52
78
|
)
|
53
79
|
end
|
54
80
|
|
81
|
+
# Runs a paginated search against the job index, sorted by 'created_at'.
#
# @param query [Hash] Elasticsearch query placed under :query in the request body
# @param page_size [Integer] value sent as :size
# @param offset [Integer] value sent as :from
# @return the raw response of client.search (404 is ignored)
def search_jobs(query, page_size, offset)
  search_body = {
    :size => page_size,
    :from => offset,
    :query => query,
    :sort => ['created_at']
  }
  client.search(:index => Utility::Constants::JOB_INDEX, :ignore => 404, :body => search_body)
end
|
93
|
+
|
55
94
|
def update_connector_configuration(connector_id, configuration)
|
56
95
|
update_connector_fields(connector_id, :configuration => configuration)
|
57
96
|
end
|
@@ -71,21 +110,90 @@ module Core
|
|
71
110
|
update_connector_configuration(connector_id, payload)
|
72
111
|
end
|
73
112
|
|
113
|
+
# Writes filtering validation results into the connector's stored filtering.
#
# The connector's current `filtering` is loaded, the validation result of every matching
# domain is merged into it (see update_filter_validation) and the whole `filtering`
# field is written back. Nothing is written when there are no results, when none of the
# stored domains (array form) has a result, or when the stored value is neither a Hash
# nor an Array.
#
# @param connector_id [String] id of the connector to update
# @param filter_validation_results [Hash] validation results keyed by domain
def update_filtering_validation(connector_id, filter_validation_results)
  return if filter_validation_results.empty?

  filtering = get_connector(connector_id).dig(:_source, :filtering)

  if filtering.is_a?(Hash)
    update_filter_validation(filtering, filter_validation_results)
  elsif filtering.is_a?(Array)
    return unless should_update_validations?(filter_validation_results, filtering)

    filtering.each { |filter| update_filter_validation(filter, filter_validation_results) }
  else
    Utility::Logger.warn("Elasticsearch returned invalid filtering format: #{filtering}. Skipping validation.")
    return
  end

  update_connector_fields(connector_id, { :filtering => filtering })
end
|
134
|
+
|
74
135
|
# Claims a sync for the connector and creates the matching job document.
#
# Steps:
# 1. read the connector (with its _seq_no/_primary_term);
# 2. fail if its last_sync_status is already "in progress";
# 3. mark the connector as in progress, conditioned on the seq_no/primary_term read in
#    step 1, so a concurrent modification surfaces as an error from update_connector_fields;
# 4. index a new job document and return it.
#
# @param connector_id [String] id of the connector to sync
# @return the freshly created job document (with_indifferent_access)
# @raise [JobAlreadyRunningError] if the connector is already syncing
# @raise [JobNotCreatedError] if the index call does not report 'created'
def claim_job(connector_id)
  connector_record = client.get(
    :index => Utility::Constants::CONNECTORS_INDEX,
    :id => connector_id,
    :ignore => 404,
    :refresh => true
  )
  seq_no = connector_record['_seq_no']
  primary_term = connector_record['_primary_term']

  if connector_record.dig('_source', 'last_sync_status') == Connectors::SyncStatus::IN_PROGRESS
    raise JobAlreadyRunningError.new(connector_id)
  end

  update_connector_fields(
    connector_id,
    { :sync_now => false,
      :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
      :last_synced => Time.now },
    seq_no,
    primary_term
  )

  # One timestamp for all three fields so a new job starts with identical
  # created_at / started_at / last_seen values.
  now = Time.now
  body = {
    :status => Connectors::SyncStatus::IN_PROGRESS,
    :worker_hostname => Socket.gethostname,
    :created_at => now,
    :started_at => now,
    :last_seen => now,
    :connector => {
      :id => connector_id,
      :filtering => convert_connector_filtering_to_job_filtering(connector_record.dig('_source', 'filtering'))
    }
  }

  index_response = client.index(:index => Utility::Constants::JOB_INDEX, :body => body, :refresh => true)
  raise JobNotCreatedError.new(connector_id, index_response) unless index_response['result'] == 'created'

  # TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
  client.get(
    :index => Utility::Constants::JOB_INDEX,
    :id => index_response['_id'],
    :ignore => 404
  ).with_indifferent_access
end
|
184
|
+
|
185
|
+
# Converts the connector's stored filtering into the shape stored on a job document.
#
# For every filtering domain only the 'active' part is carried over:
# - 'rules' is active.rules (an empty list when absent, so consumers can iterate safely);
# - 'advanced_snippet' is active.advanced_snippet.value, or the snippet hash itself when
#   it has no 'value' entry;
# - 'warnings' is always an empty list for now.
#
# @param connector_filtering [Array<Hash>, Hash, nil] filtering as stored on the connector;
#   a single Hash is treated as a one-element list
# @return [Array<Hash>] one entry per filtering domain; [] when no filtering is given
def convert_connector_filtering_to_job_filtering(connector_filtering)
  return [] unless connector_filtering

  # NOTE: Kernel#Array would turn a Hash into key/value pairs, hence the explicit wrap.
  domains = connector_filtering.is_a?(Array) ? connector_filtering : [connector_filtering]

  domains.map do |filtering_domain|
    snippet = filtering_domain.dig('active', 'advanced_snippet') || {}
    {
      'domain' => filtering_domain['domain'],
      'rules' => filtering_domain.dig('active', 'rules') || [],
      'advanced_snippet' => snippet['value'] || snippet,
      'warnings' => [] # TODO: in https://github.com/elastic/enterprise-search-team/issues/3174
    }
  end
end
|
90
198
|
|
91
199
|
def update_connector_status(connector_id, status, error_message = nil)
|
@@ -99,22 +207,33 @@ module Core
|
|
99
207
|
update_connector_fields(connector_id, body)
|
100
208
|
end
|
101
209
|
|
102
|
-
def
|
103
|
-
|
210
|
+
# Refreshes a running job: bumps `last_seen` and stores the given metadata on the job document.
#
# Keys in `metadata` take precedence over `last_seen` if they clash (Hash#merge).
#
# @param job_id [String] id of the job document
# @param metadata [Hash, nil] extra fields to store; nil is treated as "no extra fields",
#   matching how complete_sync handles a missing metadata hash
# @return the raw response of client.update
def update_sync(job_id, metadata)
  doc = { :last_seen => Time.now }.merge(metadata || {})
  client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => { :doc => doc })
end
|
216
|
+
|
217
|
+
# Records the outcome of a sync on both the connector and the job document.
#
# The status is ERROR when `error` is truthy, COMPLETED otherwise. The connector gets the
# status, the error and the document counts taken from `metadata`; the job gets the status,
# completion timestamps, the error and every entry of `metadata` (metadata wins on key clashes).
#
# @param connector_id [String] id of the connector that was synced
# @param job_id [String] id of the job document to close
# @param metadata [Hash, nil] job metadata; nil is treated as an empty hash
# @param error [Object, nil] error description, nil/false when the sync succeeded
# @return the raw response of the job update
def complete_sync(connector_id, job_id, metadata, error)
  sync_status =
    if error
      Connectors::SyncStatus::ERROR
    else
      Connectors::SyncStatus::COMPLETED
    end

  metadata ||= {}

  connector_fields = {
    :last_sync_status => sync_status,
    :last_sync_error => error,
    :error => error,
    :last_synced => Time.now,
    :last_indexed_document_count => metadata[:indexed_document_count],
    :last_deleted_document_count => metadata[:deleted_document_count]
  }
  update_connector_fields(connector_id, connector_fields)

  job_fields = {
    :status => sync_status,
    :completed_at => Time.now,
    :last_seen => Time.now,
    :error => error
  }
  body = { :doc => job_fields.merge(metadata) }
  client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => body)
end
|
@@ -136,7 +255,7 @@ module Core
|
|
136
255
|
}
|
137
256
|
loop do
|
138
257
|
response = client.search(:body => body)
|
139
|
-
hits = response
|
258
|
+
hits = response.dig('hits', 'hits') || []
|
140
259
|
|
141
260
|
ids = hits.map { |h| h['_id'] }
|
142
261
|
result += ids
|
@@ -202,12 +321,105 @@ module Core
|
|
202
321
|
:properties => {
|
203
322
|
:api_key_id => { :type => :keyword },
|
204
323
|
:configuration => { :type => :object },
|
205
|
-
:
|
324
|
+
:description => { :type => :text },
|
325
|
+
:error => { :type => :keyword },
|
326
|
+
:features => {
|
327
|
+
:properties => {
|
328
|
+
:filtering_advanced_config => { :type => :boolean },
|
329
|
+
:filtering_rules => { :type => :boolean }
|
330
|
+
}
|
331
|
+
},
|
332
|
+
:filtering => {
|
333
|
+
:properties => {
|
334
|
+
:domain => { :type => :keyword },
|
335
|
+
:active => {
|
336
|
+
:properties => {
|
337
|
+
:rules => {
|
338
|
+
:properties => {
|
339
|
+
:id => { :type => :keyword },
|
340
|
+
:policy => { :type => :keyword },
|
341
|
+
:field => { :type => :keyword },
|
342
|
+
:rule => { :type => :keyword },
|
343
|
+
:value => { :type => :keyword },
|
344
|
+
:order => { :type => :short },
|
345
|
+
:created_at => { :type => :date },
|
346
|
+
:updated_at => { :type => :date }
|
347
|
+
}
|
348
|
+
},
|
349
|
+
:advanced_snippet => {
|
350
|
+
:properties => {
|
351
|
+
:value => { :type => :object },
|
352
|
+
:created_at => { :type => :date },
|
353
|
+
:updated_at => { :type => :date }
|
354
|
+
}
|
355
|
+
},
|
356
|
+
:validation => {
|
357
|
+
:properties => {
|
358
|
+
:state => { :type => :keyword },
|
359
|
+
:errors => {
|
360
|
+
:properties => {
|
361
|
+
:ids => { :type => :keyword },
|
362
|
+
:messages => { :type => :text }
|
363
|
+
}
|
364
|
+
}
|
365
|
+
}
|
366
|
+
}
|
367
|
+
}
|
368
|
+
},
|
369
|
+
:draft => {
|
370
|
+
:properties => {
|
371
|
+
:rules => {
|
372
|
+
:properties => {
|
373
|
+
:id => { :type => :keyword },
|
374
|
+
:policy => { :type => :keyword },
|
375
|
+
:field => { :type => :keyword },
|
376
|
+
:rule => { :type => :keyword },
|
377
|
+
:value => { :type => :keyword },
|
378
|
+
:order => { :type => :short },
|
379
|
+
:created_at => { :type => :date },
|
380
|
+
:updated_at => { :type => :date }
|
381
|
+
}
|
382
|
+
},
|
383
|
+
:advanced_snippet => {
|
384
|
+
:properties => {
|
385
|
+
:value => { :type => :object },
|
386
|
+
:created_at => { :type => :date },
|
387
|
+
:updated_at => { :type => :date }
|
388
|
+
}
|
389
|
+
},
|
390
|
+
:validation => {
|
391
|
+
:properties => {
|
392
|
+
:state => { :type => :keyword },
|
393
|
+
:errors => {
|
394
|
+
:properties => {
|
395
|
+
:ids => { :type => :keyword },
|
396
|
+
:messages => { :type => :text }
|
397
|
+
}
|
398
|
+
}
|
399
|
+
}
|
400
|
+
}
|
401
|
+
}
|
402
|
+
}
|
403
|
+
}
|
404
|
+
},
|
206
405
|
:index_name => { :type => :keyword },
|
406
|
+
:is_native => { :type => :boolean },
|
407
|
+
:language => { :type => :keyword },
|
207
408
|
:last_seen => { :type => :date },
|
409
|
+
:last_sync_error => { :type => :keyword },
|
410
|
+
:last_sync_status => { :type => :keyword },
|
208
411
|
:last_synced => { :type => :date },
|
209
|
-
:
|
210
|
-
:
|
412
|
+
:last_deleted_document_count => { :type => :long },
|
413
|
+
:last_indexed_document_count => { :type => :long },
|
414
|
+
:name => { :type => :keyword },
|
415
|
+
:pipeline => {
|
416
|
+
:properties => {
|
417
|
+
:extract_binary_content => { :type => :boolean },
|
418
|
+
:name => { :type => :keyword },
|
419
|
+
:reduce_whitespace => { :type => :boolean },
|
420
|
+
:run_ml_inference => { :type => :boolean }
|
421
|
+
}
|
422
|
+
},
|
211
423
|
:scheduling => {
|
212
424
|
:properties => {
|
213
425
|
:enabled => { :type => :boolean },
|
@@ -216,9 +428,7 @@ module Core
|
|
216
428
|
},
|
217
429
|
:service_type => { :type => :keyword },
|
218
430
|
:status => { :type => :keyword },
|
219
|
-
:
|
220
|
-
:sync_now => { :type => :boolean },
|
221
|
-
:sync_status => { :type => :keyword }
|
431
|
+
:sync_now => { :type => :boolean }
|
222
432
|
}
|
223
433
|
}
|
224
434
|
ensure_index_exists("#{Utility::Constants::CONNECTORS_INDEX}-v1", system_index_body(:alias_name => Utility::Constants::CONNECTORS_INDEX, :mappings => mappings))
|
@@ -229,32 +439,112 @@ module Core
|
|
229
439
|
# Builds the mappings for the job index and hands them to ensure_index_exists for
# "<JOB_INDEX>-v1", with JOB_INDEX passed to system_index_body as the alias name.
def ensure_job_index_exists
  # Small builders so every field gets its own fresh hash.
  typed = ->(type) { { :type => type } }
  nested = ->(properties) { { :properties => properties } }

  rules_mapping = nested.call(
    :id => typed.call(:keyword),
    :policy => typed.call(:keyword),
    :field => typed.call(:keyword),
    :rule => typed.call(:keyword),
    :value => typed.call(:keyword),
    :order => typed.call(:short),
    :created_at => typed.call(:date),
    :updated_at => typed.call(:date)
  )

  advanced_snippet_mapping = nested.call(
    :value => typed.call(:object),
    :created_at => typed.call(:date),
    :updated_at => typed.call(:date)
  )

  warnings_mapping = nested.call(
    :ids => typed.call(:keyword),
    :messages => typed.call(:text)
  )

  filtering_mapping = nested.call(
    :domain => typed.call(:keyword),
    :rules => rules_mapping,
    :advanced_snippet => advanced_snippet_mapping,
    :warnings => warnings_mapping
  )

  pipeline_mapping = nested.call(
    :extract_binary_content => typed.call(:boolean),
    :name => typed.call(:keyword),
    :reduce_whitespace => typed.call(:boolean),
    :run_ml_inference => typed.call(:boolean)
  )

  # Snapshot of the connector the job belongs to.
  connector_mapping = nested.call(
    :configuration => typed.call(:object),
    :filtering => filtering_mapping,
    :id => typed.call(:keyword),
    :index_name => typed.call(:keyword),
    :language => typed.call(:keyword),
    :pipeline => pipeline_mapping,
    :service_type => typed.call(:keyword)
  )

  mappings = nested.call(
    :cancelation_requested_at => typed.call(:date),
    :canceled_at => typed.call(:date),
    :completed_at => typed.call(:date),
    :connector => connector_mapping,
    :created_at => typed.call(:date),
    :deleted_document_count => typed.call(:integer),
    :error => typed.call(:text),
    :indexed_document_count => typed.call(:integer),
    :indexed_document_volume => typed.call(:integer),
    :last_seen => typed.call(:date),
    :metadata => typed.call(:object),
    :started_at => typed.call(:date),
    :status => typed.call(:keyword),
    :total_document_count => typed.call(:integer),
    :trigger_method => typed.call(:keyword),
    :worker_hostname => typed.call(:keyword)
  )

  index_name = "#{Utility::Constants::JOB_INDEX}-v1"
  ensure_index_exists(index_name, system_index_body(:alias_name => Utility::Constants::JOB_INDEX, :mappings => mappings))
end
|
244
508
|
|
245
|
-
def update_connector_fields(connector_id, doc = {})
|
509
|
+
# Partially updates a connector document.
#
# Without seq_no/primary_term the update is sent with :retry_on_conflict => 3. When both are
# given, the update is instead conditioned on them (if_seq_no / if_primary_term) and
# retry_on_conflict is not sent.
#
# @param connector_id [String] id of the connector document
# @param doc [Hash] fields to update; nothing is sent when empty
# @param seq_no [Integer, nil] expected sequence number of the document
# @param primary_term [Integer, nil] expected primary term of the document
# @return the raw response of client.update, or nil when doc is empty
# @raise [ConnectorVersionChangedError] when Elasticsearch answers with a conflict
def update_connector_fields(connector_id, doc = {}, seq_no = nil, primary_term = nil)
  return if doc.empty?

  update_args = {
    :index => Utility::Constants::CONNECTORS_INDEX,
    :id => connector_id,
    :body => { :doc => doc },
    :refresh => true
  }

  # seq_no and primary_term are used for optimistic concurrency control
  # see https://www.elastic.co/guide/en/elasticsearch/reference/current/optimistic-concurrency-control.html
  if seq_no && primary_term
    update_args[:if_seq_no] = seq_no
    update_args[:if_primary_term] = primary_term
  else
    update_args[:retry_on_conflict] = 3
  end

  client.update(update_args)
rescue Elastic::Transport::Transport::Errors::Conflict
  # VersionConflictException
  # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
  raise ConnectorVersionChangedError.new(connector_id, seq_no, primary_term)
end
|
533
|
+
|
534
|
+
# Returns the 'count' entry of the count response for the given index.
#
# @param index_name [String] index to count documents in
def document_count(index_name)
  count_response = client.count(:index => index_name)
  count_response['count']
end
|
255
537
|
|
256
538
|
private
|
257
539
|
|
540
|
+
# Tells whether at least one stored filtering domain has a validation result.
#
# @param domain_validations [Hash] validation results keyed by domain
# @param filtering [Array<Hash>] stored filters, each read through filter[:domain]
# @return [Boolean] true when the two sets of domains intersect
def should_update_validations?(domain_validations, filtering)
  present = filtering.map { |filter| filter[:domain] }
  shared = present & domain_validations.keys

  # non-empty intersection -> domains to update present
  shared.length.positive?
end
|
547
|
+
|
258
548
|
# Memoized Elasticsearch client built from App::Config[:elasticsearch].
def client
  return @client if @client

  @client = Utility::EsClient.new(App::Config[:elasticsearch])
end
|
@@ -264,6 +554,15 @@ module Core
|
|
264
554
|
index_version = index_versions.max # gets the largest suffix number
|
265
555
|
"#{alias_name}-v#{index_version}"
|
266
556
|
end
|
557
|
+
|
558
|
+
# Merges the validation result for the filter's domain into filter[:draft][:validation].
# The filter is modified in place (deep_merge!); filters whose domain has no result are untouched.
#
# @param filter [Hash] a single stored filter, read through filter[:domain]
# @param domain_validations [Hash] validation results keyed by domain
# @return the merged filter, or nil when there was nothing to merge
def update_filter_validation(filter, domain_validations)
  domain = filter[:domain]
  return unless domain_validations.key?(domain)

  filter.deep_merge!(:draft => { :validation => domain_validations[domain] })
end
|
267
566
|
end
|
268
567
|
end
|
269
568
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'core/filtering'
|
10
|
+
require 'utility/filtering'
|
11
|
+
|
12
|
+
module Core
  module Filtering
    # Applies a job's basic filtering rules to documents.
    #
    # The rules are read once from the job description, ordered by their 'order' field,
    # and the DEFAULT rule is dropped from the list; it is used as the fallback instead
    # when no other rule matches.
    class PostProcessEngine
      attr_reader :rules

      # @param job_description [Hash] job document; rules are read from connector.filtering
      def initialize(job_description)
        @rules = ordered_rules(job_description.dig('connector', 'filtering'))
      end

      # Finds the first rule (in order) matching the document.
      #
      # @param document [Hash] document to check; rules are matched against a copy with string keys
      # @return [PostProcessResult] the document paired with the matching rule, or with
      #   SimpleRule::DEFAULT_RULE when none matches
      def process(document)
        # Stringify once instead of once per rule.
        comparable_document = document.stringify_keys
        matching_rule = @rules.find { |rule| rule.match?(comparable_document) }
        PostProcessResult.new(document, matching_rule || SimpleRule::DEFAULT_RULE)
      end

      private

      # Turns the raw rule hashes into SimpleRule objects sorted by 'order', without the DEFAULT rule.
      # A filter without a 'rules' entry yields an empty list.
      def ordered_rules(job_filtering)
        job_rules = Utility::Filtering.extract_filter(job_filtering)['rules'] || []
        job_rules
          .sort_by { |rule| rule['order'] }
          .reject { |rule| rule['id'] == Core::Filtering::SimpleRule::DEFAULT_RULE_ID }
          .map { |rule| SimpleRule.new(rule) }
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'utility/logger'
|
10
|
+
|
11
|
+
module Core
  module Filtering
    # Outcome of running a document through the filtering rules: the document together
    # with the rule that matched it.
    class PostProcessResult
      attr_reader :document, :matching_rule

      # Stores the pair and writes a debug log line describing the decision.
      #
      # @param document [Hash] the processed document
      # @param matching_rule [SimpleRule] the rule that matched the document
      def initialize(document, matching_rule)
        @document = document
        @matching_rule = matching_rule
        # Documents may be keyed by strings or symbols; look the id up under both
        # so the log line is not left with an empty id for symbol-keyed documents.
        document_id = document['id'] || document[:id]
        Utility::Logger.debug("Document '#{document_id}' matched filtering rule: #{matching_rule.id}. It will be #{matching_rule.policy}d")
      end

      # @return [Boolean] whether the matching rule's policy is to include the document
      def is_include?
        matching_rule.is_include?
      end
    end
  end
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'utility/logger'
|
10
|
+
|
11
|
+
module Core
  module Filtering
    # A single basic filtering rule: "<policy> documents whose <field> <rule> <value>".
    #
    # The rule value is coerced to the type of the document's field value before the
    # comparison (see #coerce).
    class SimpleRule
      DEFAULT_RULE_ID = 'DEFAULT'

      # Supported policies.
      class Policy
        INCLUDE = 'include'
        EXCLUDE = 'exclude'
      end

      # Supported comparison kinds.
      class Rule
        REGEX = 'regex'
        EQUALS = 'equals'
        STARTS_WITH = 'starts_with'
        ENDS_WITH = 'ends_with'
        CONTAINS = 'contains'
        LESS_THAN = '<'
        GREATER_THAN = '>'
      end

      attr_reader :policy, :field, :rule, :value, :id

      # @param rule_hash [Hash] string-keyed hash with 'policy', 'field', 'rule', 'value' and 'id'
      # @raise [RuntimeError] "<key> is required" when one of the keys is missing
      def initialize(rule_hash)
        @policy = rule_hash.fetch('policy')
        @field = rule_hash.fetch('field')
        @rule = rule_hash.fetch('rule')
        @value = rule_hash.fetch('value')
        @id = rule_hash.fetch('id')
        @rule_hash = rule_hash
      rescue KeyError => e
        raise "#{e.key} is required"
      end

      # Convenience constructor taking the rule parts as positional arguments.
      def self.from_args(id, policy, field, rule, value)
        SimpleRule.new(
          {
            'id' => id,
            'policy' => policy,
            'field' => field,
            'rule' => rule,
            'value' => value
          }
        )
      end

      # Fallback rule: matches every document (see #match?) and includes it.
      DEFAULT_RULE = SimpleRule.new(
        'policy' => 'include',
        'field' => '_',
        'rule' => 'regex',
        'value' => '.*',
        'id' => SimpleRule::DEFAULT_RULE_ID
      )

      # Checks whether the document satisfies this rule.
      #
      # The DEFAULT rule matches everything. Any other rule never matches a document
      # whose field is missing or nil, and an unknown rule kind never matches.
      #
      # @param document [Hash] document keyed the same way as `field` (string keys)
      # @return [Boolean]
      def match?(document)
        return true if id == DEFAULT_RULE_ID

        doc_value = document[field]
        return false if doc_value.nil?

        coerced_value = coerce(doc_value)
        case rule
        when Rule::EQUALS
          case coerced_value
          when Integer, Float
            doc_value == coerced_value
          when DateTime, Time
            doc_value.to_s == coerced_value.to_s
          else
            doc_value.to_s == coerced_value
          end
        when Rule::STARTS_WITH
          doc_value.to_s.start_with?(value)
        when Rule::ENDS_WITH
          doc_value.to_s.end_with?(value)
        when Rule::CONTAINS
          doc_value.to_s.include?(value)
        when Rule::REGEX
          # match? yields a plain boolean (the rule value is used as the pattern)
          doc_value.to_s.match?(/#{value}/)
        when Rule::LESS_THAN
          doc_value < coerced_value
        when Rule::GREATER_THAN
          doc_value > coerced_value
        else
          false
        end
      end

      # Converts the rule's value to the type of the given document value so that both
      # can be compared. Falls back to the value's string form (and logs at debug level)
      # when the conversion raises.
      #
      # @param doc_value [Object] the document's value for `field`
      # @return [String, Integer, Float, DateTime, Time] the coerced rule value
      def coerce(doc_value)
        case doc_value
        when String
          value.to_s
        when Integer
          value.to_i
        when Float
          # without this, float fields would be compared against a String and `<` / `>` would raise
          value.to_f
        when DateTime, Time
          to_date(value)
        when TrueClass, FalseClass # Ruby doesn't have a Boolean type, TIL
          to_bool(value).to_s
        else
          value.to_s
        end
      rescue StandardError => e
        Utility::Logger.debug("Failed to coerce value '#{value}' (#{value.class}) based on document value '#{doc_value}' (#{doc_value.class}) due to error: #{e.class}: #{e.message}")
        value.to_s
      end

      def is_include?
        policy == Policy::INCLUDE
      end

      def is_exclude?
        policy == Policy::EXCLUDE
      end

      # @return [Hash] the hash this rule was built from
      def to_h
        @rule_hash
      end

      private

      # Parses common textual spellings of booleans; nil and blank strings count as false.
      #
      # @raise [ArgumentError] when the value is not a recognised boolean spelling
      def to_bool(str)
        return true if str == true
        return false if str == false

        text = str.to_s.strip
        # \A / \z anchor the whole string (^ / $ would accept a matching line inside multi-line input)
        return true if text.match?(/\A(true|t|yes|y|on|1)\z/i)
        return false if text.empty? || text.match?(/\A(false|f|no|n|off|0)\z/i)

        raise ArgumentError.new("invalid value for Boolean: \"#{str}\"")
      end

      # Parses a date string; when it is not a parseable date, falls back to reading it as an
      # integer epoch timestamp.
      # NOTE(review): Time.at interprets the number as seconds, not milliseconds - confirm
      # which unit rule values are expected to use.
      def to_date(str)
        DateTime.parse(str)
      rescue ArgumentError
        Time.at(str.to_i)
      end
    end
  end
end
|