connectors_service 8.6.0.3 → 8.6.0.4.pre.20221114T233727Z
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/config/connectors.yml +9 -10
- data/lib/app/config.rb +2 -0
- data/lib/app/dispatcher.rb +17 -1
- data/lib/app/preflight_check.rb +15 -0
- data/lib/connectors/base/connector.rb +37 -4
- data/lib/connectors/base/simple_rules_parser.rb +42 -0
- data/lib/connectors/connector_status.rb +4 -4
- data/lib/connectors/example/{example_attachments → attachments}/first_attachment.txt +0 -0
- data/lib/connectors/example/{example_attachments → attachments}/second_attachment.txt +0 -0
- data/lib/connectors/example/{example_attachments → attachments}/third_attachment.txt +0 -0
- data/lib/connectors/example/connector.rb +43 -4
- data/lib/connectors/gitlab/connector.rb +16 -2
- data/lib/connectors/mongodb/connector.rb +173 -50
- data/lib/connectors/mongodb/mongo_rules_parser.rb +81 -0
- data/lib/connectors/registry.rb +2 -2
- data/lib/connectors/sync_status.rb +23 -4
- data/lib/core/configuration.rb +4 -2
- data/lib/core/connector_job.rb +137 -0
- data/lib/core/connector_settings.rb +29 -18
- data/lib/core/elastic_connector_actions.rb +331 -32
- data/lib/core/filtering/post_process_engine.rb +39 -0
- data/lib/core/filtering/post_process_result.rb +27 -0
- data/lib/core/filtering/simple_rule.rb +141 -0
- data/lib/core/filtering/validation_job_runner.rb +53 -0
- data/lib/{connectors_app/// → core/filtering/validation_status.rb} +9 -5
- data/lib/core/filtering.rb +17 -0
- data/lib/core/ingestion/es_sink.rb +59 -0
- data/lib/core/ingestion/ingester.rb +90 -0
- data/lib/core/{output_sink.rb → ingestion.rb} +2 -5
- data/lib/core/native_scheduler.rb +3 -0
- data/lib/core/scheduler.rb +43 -10
- data/lib/core/single_scheduler.rb +3 -0
- data/lib/core/sync_job_runner.rb +78 -18
- data/lib/core.rb +2 -0
- data/lib/utility/bulk_queue.rb +85 -0
- data/lib/utility/common.rb +20 -0
- data/lib/utility/constants.rb +2 -0
- data/lib/utility/errors.rb +5 -0
- data/lib/utility/es_client.rb +6 -2
- data/lib/utility/filtering.rb +22 -0
- data/lib/utility/logger.rb +2 -1
- data/lib/utility.rb +5 -3
- metadata +27 -18
- data/lib/core/output_sink/base_sink.rb +0 -33
- data/lib/core/output_sink/combined_sink.rb +0 -38
- data/lib/core/output_sink/console_sink.rb +0 -51
- data/lib/core/output_sink/es_sink.rb +0 -74
@@ -10,8 +10,27 @@ require 'active_support/core_ext/hash'
|
|
10
10
|
require 'connectors/connector_status'
|
11
11
|
require 'connectors/sync_status'
|
12
12
|
require 'utility'
|
13
|
+
require 'elastic-transport'
|
13
14
|
|
14
15
|
module Core
|
16
|
+
# Raised by claim_job when the connector's last sync status is still in progress.
class JobAlreadyRunningError < StandardError
  # @param connector_id [String] id of the connector, interpolated into the message
  def initialize(connector_id)
    message = "Sync job for connector '#{connector_id}' is already running."
    super(message)
  end
end
|
21
|
+
|
22
|
+
# Raised by claim_job when indexing the job document does not report 'created'.
class JobNotCreatedError < StandardError
  # @param connector_id [String] id of the connector the job was meant for
  # @param response [Object] the index response, interpolated into the message
  def initialize(connector_id, response)
    message = "Sync job for connector '#{connector_id}' could not be created. Response: #{response}"
    super(message)
  end
end
|
27
|
+
|
28
|
+
# Raised by update_connector_fields when the update is rejected with a conflict.
class ConnectorVersionChangedError < StandardError
  # @param connector_id [String] id of the connector that was being updated
  # @param seq_no [Integer, nil] sequence number the update was conditioned on
  # @param primary_term [Integer, nil] primary term the update was conditioned on
  def initialize(connector_id, seq_no, primary_term)
    message = "Version conflict: seq_no [#{seq_no}] and primary_term [#{primary_term}] do not match for connector '#{connector_id}'."
    super(message)
  end
end
|
33
|
+
|
15
34
|
class ElasticConnectorActions
|
16
35
|
class << self
|
17
36
|
|
@@ -30,10 +49,17 @@ module Core
|
|
30
49
|
end
|
31
50
|
|
32
51
|
# Reads one connector document from the connectors index.
# A 404 is passed to the client as an ignored status instead of being raised.
#
# @param connector_id [String] document id in the connectors index
# @return [Hash] the client response, wrapped with indifferent access
def get_connector(connector_id)
  # TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
  response = client.get(:index => Utility::Constants::CONNECTORS_INDEX, :id => connector_id, :ignore => 404)
  response.with_indifferent_access
end
|
35
55
|
|
56
|
+
# Reads one sync job document from the job index.
# A 404 is passed to the client as an ignored status instead of being raised.
#
# @param job_id [String] document id in the job index
# @return [Hash] the client response, wrapped with indifferent access
def get_job(job_id)
  # TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
  response = client.get(:index => Utility::Constants::JOB_INDEX, :id => job_id, :ignore => 404)
  response.with_indifferent_access
end
|
60
|
+
|
36
61
|
def connectors_meta
|
62
|
+
# TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
|
37
63
|
alias_mappings = client.indices.get_mapping(:index => Utility::Constants::CONNECTORS_INDEX).with_indifferent_access
|
38
64
|
index = get_latest_index_in_alias(Utility::Constants::CONNECTORS_INDEX, alias_mappings.keys)
|
39
65
|
alias_mappings.dig(index, 'mappings', '_meta') || {}
|
@@ -52,6 +78,19 @@ module Core
|
|
52
78
|
)
|
53
79
|
end
|
54
80
|
|
81
|
+
# Searches the job index, one page at a time, sorted by 'created_at'.
#
# @param query [Hash] query clause placed under :query in the request body
# @param page_size [Integer] value sent as :size
# @param offset [Integer] value sent as :from
# @return the raw client search response
def search_jobs(query, page_size, offset)
  search_body = {
    :size => page_size,
    :from => offset,
    :query => query,
    :sort => ['created_at']
  }
  client.search(:index => Utility::Constants::JOB_INDEX, :ignore => 404, :body => search_body)
end
|
93
|
+
|
55
94
|
# Stores a new configuration on the connector document.
#
# @param connector_id [String] id of the connector to update
# @param configuration [Hash] value written to the :configuration field
def update_connector_configuration(connector_id, configuration)
  changes = { :configuration => configuration }
  update_connector_fields(connector_id, changes)
end
|
@@ -71,21 +110,90 @@ module Core
|
|
71
110
|
update_connector_configuration(connector_id, payload)
|
72
111
|
end
|
73
112
|
|
113
|
+
# Merges validation results into the connector's filtering and writes it back.
# Does nothing when there are no results, when the stored filtering is neither
# a Hash nor an Array, or when an Array filtering contains none of the
# validated domains.
#
# @param connector_id [String] id of the connector to update
# @param filter_validation_results [Hash] validation results keyed by domain
def update_filtering_validation(connector_id, filter_validation_results)
  return if filter_validation_results.empty?

  filtering = get_connector(connector_id).dig(:_source, :filtering)

  if filtering.is_a?(Hash)
    update_filter_validation(filtering, filter_validation_results)
  elsif filtering.is_a?(Array)
    return unless should_update_validations?(filter_validation_results, filtering)

    filtering.each { |filter| update_filter_validation(filter, filter_validation_results) }
  else
    Utility::Logger.warn("Elasticsearch returned invalid filtering format: #{filtering}. Skipping validation.")
    return
  end

  update_connector_fields(connector_id, { :filtering => filtering })
end
|
134
|
+
|
74
135
|
# Claims a sync for the connector and creates the matching job document.
#
# Steps: read the connector, refuse if its last sync is still in progress,
# mark it in progress (conditioned on the seq_no/primary_term just read),
# index a job document and return that document.
#
# @param connector_id [String] id of the connector to sync
# @return [Hash] the freshly created job document, with indifferent access
# @raise [JobAlreadyRunningError] when last_sync_status is already in progress
# @raise [JobNotCreatedError] when the index call does not report 'created'
def claim_job(connector_id)
  connector_record = client.get(
    :index => Utility::Constants::CONNECTORS_INDEX,
    :id => connector_id,
    :ignore => 404,
    :refresh => true
  )
  seq_no = connector_record['_seq_no']
  primary_term = connector_record['_primary_term']
  last_sync_status = connector_record.dig('_source', 'last_sync_status')

  raise JobAlreadyRunningError.new(connector_id) if last_sync_status == Connectors::SyncStatus::IN_PROGRESS

  claimed_fields = {
    :sync_now => false,
    :last_sync_status => Connectors::SyncStatus::IN_PROGRESS,
    :last_synced => Time.now
  }
  update_connector_fields(connector_id, claimed_fields, seq_no, primary_term)

  job_filtering = convert_connector_filtering_to_job_filtering(connector_record.dig('_source', 'filtering'))
  body = {
    :status => Connectors::SyncStatus::IN_PROGRESS,
    :worker_hostname => Socket.gethostname,
    :created_at => Time.now,
    :started_at => Time.now,
    :last_seen => Time.now,
    :connector => {
      :id => connector_id,
      :filtering => job_filtering
    }
  }

  index_response = client.index(:index => Utility::Constants::JOB_INDEX, :body => body, :refresh => true)
  raise JobNotCreatedError.new(connector_id, index_response) unless index_response['result'] == 'created'

  # TODO: remove the usage of with_indifferent_access. Ideally this should return a hash or nil if not found
  client.get(
    :index => Utility::Constants::JOB_INDEX,
    :id => index_response['_id'],
    :ignore => 404
  ).with_indifferent_access
end
|
184
|
+
|
185
|
+
# Translates the connector's filtering into the shape stored on a job document.
# Only the 'active' part of each domain is carried over.
#
# @param connector_filtering [Array<Hash>, Hash, nil] filtering as stored on the connector;
#   a single Hash is treated as a one-element list
# @return [Array<Hash>] one entry per domain with 'domain', 'rules',
#   'advanced_snippet' and 'warnings' keys; empty when no filtering is given
def convert_connector_filtering_to_job_filtering(connector_filtering)
  return [] unless connector_filtering

  domains = connector_filtering.is_a?(Array) ? connector_filtering : [connector_filtering]
  domains.map do |filtering_domain|
    snippet = filtering_domain.dig('active', 'advanced_snippet') || {}
    # prefer the snippet's 'value'; fall back to the snippet itself when it has none
    snippet_value = snippet['value'] || snippet
    {
      'domain' => filtering_domain['domain'],
      'rules' => filtering_domain.dig('active', 'rules'),
      'advanced_snippet' => snippet_value,
      'warnings' => [] # TODO: in https://github.com/elastic/enterprise-search-team/issues/3174
    }
  end
end
|
90
198
|
|
91
199
|
def update_connector_status(connector_id, status, error_message = nil)
|
@@ -99,22 +207,33 @@ module Core
|
|
99
207
|
update_connector_fields(connector_id, body)
|
100
208
|
end
|
101
209
|
|
102
|
-
def
|
103
|
-
|
210
|
+
# Records a heartbeat on a running job, plus any extra fields.
#
# @param job_id [String] id of the job document
# @param metadata [Hash, nil] extra fields merged into the update; keys here
#   win over :last_seen. nil is treated as no extra fields, matching the
#   `metadata ||= {}` guard in complete_sync.
# @return the raw client update response
def update_sync(job_id, metadata)
  doc = { :last_seen => Time.now }.merge(metadata || {})
  client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => { :doc => doc })
end
|
216
|
+
|
217
|
+
# Marks a sync as finished on both the connector and the job document.
# The status is ERROR when an error is given, COMPLETED otherwise.
#
# @param connector_id [String] id of the connector that was synced
# @param job_id [String] id of the job document
# @param metadata [Hash, nil] job fields merged into the job update; its
#   :indexed_document_count and :deleted_document_count are copied to the connector
# @param error [String, nil] error to store; nil means the sync succeeded
# @return the raw client update response for the job document
def complete_sync(connector_id, job_id, metadata, error)
  sync_status = if error
                  Connectors::SyncStatus::ERROR
                else
                  Connectors::SyncStatus::COMPLETED
                end

  metadata ||= {}

  connector_fields = {
    :last_sync_status => sync_status,
    :last_sync_error => error,
    :error => error,
    :last_synced => Time.now,
    :last_indexed_document_count => metadata[:indexed_document_count],
    :last_deleted_document_count => metadata[:deleted_document_count]
  }
  update_connector_fields(connector_id, connector_fields)

  job_fields = {
    :status => sync_status,
    :completed_at => Time.now,
    :last_seen => Time.now,
    :error => error
  }.merge(metadata)
  client.update(:index => Utility::Constants::JOB_INDEX, :id => job_id, :body => { :doc => job_fields })
end
|
@@ -136,7 +255,7 @@ module Core
|
|
136
255
|
}
|
137
256
|
loop do
|
138
257
|
response = client.search(:body => body)
|
139
|
-
hits = response
|
258
|
+
hits = response.dig('hits', 'hits') || []
|
140
259
|
|
141
260
|
ids = hits.map { |h| h['_id'] }
|
142
261
|
result += ids
|
@@ -202,12 +321,105 @@ module Core
|
|
202
321
|
:properties => {
|
203
322
|
:api_key_id => { :type => :keyword },
|
204
323
|
:configuration => { :type => :object },
|
205
|
-
:
|
324
|
+
:description => { :type => :text },
|
325
|
+
:error => { :type => :keyword },
|
326
|
+
:features => {
|
327
|
+
:properties => {
|
328
|
+
:filtering_advanced_config => { :type => :boolean },
|
329
|
+
:filtering_rules => { :type => :boolean }
|
330
|
+
}
|
331
|
+
},
|
332
|
+
:filtering => {
|
333
|
+
:properties => {
|
334
|
+
:domain => { :type => :keyword },
|
335
|
+
:active => {
|
336
|
+
:properties => {
|
337
|
+
:rules => {
|
338
|
+
:properties => {
|
339
|
+
:id => { :type => :keyword },
|
340
|
+
:policy => { :type => :keyword },
|
341
|
+
:field => { :type => :keyword },
|
342
|
+
:rule => { :type => :keyword },
|
343
|
+
:value => { :type => :keyword },
|
344
|
+
:order => { :type => :short },
|
345
|
+
:created_at => { :type => :date },
|
346
|
+
:updated_at => { :type => :date }
|
347
|
+
}
|
348
|
+
},
|
349
|
+
:advanced_snippet => {
|
350
|
+
:properties => {
|
351
|
+
:value => { :type => :object },
|
352
|
+
:created_at => { :type => :date },
|
353
|
+
:updated_at => { :type => :date }
|
354
|
+
}
|
355
|
+
},
|
356
|
+
:validation => {
|
357
|
+
:properties => {
|
358
|
+
:state => { :type => :keyword },
|
359
|
+
:errors => {
|
360
|
+
:properties => {
|
361
|
+
:ids => { :type => :keyword },
|
362
|
+
:messages => { :type => :text }
|
363
|
+
}
|
364
|
+
}
|
365
|
+
}
|
366
|
+
}
|
367
|
+
}
|
368
|
+
},
|
369
|
+
:draft => {
|
370
|
+
:properties => {
|
371
|
+
:rules => {
|
372
|
+
:properties => {
|
373
|
+
:id => { :type => :keyword },
|
374
|
+
:policy => { :type => :keyword },
|
375
|
+
:field => { :type => :keyword },
|
376
|
+
:rule => { :type => :keyword },
|
377
|
+
:value => { :type => :keyword },
|
378
|
+
:order => { :type => :short },
|
379
|
+
:created_at => { :type => :date },
|
380
|
+
:updated_at => { :type => :date }
|
381
|
+
}
|
382
|
+
},
|
383
|
+
:advanced_snippet => {
|
384
|
+
:properties => {
|
385
|
+
:value => { :type => :object },
|
386
|
+
:created_at => { :type => :date },
|
387
|
+
:updated_at => { :type => :date }
|
388
|
+
}
|
389
|
+
},
|
390
|
+
:validation => {
|
391
|
+
:properties => {
|
392
|
+
:state => { :type => :keyword },
|
393
|
+
:errors => {
|
394
|
+
:properties => {
|
395
|
+
:ids => { :type => :keyword },
|
396
|
+
:messages => { :type => :text }
|
397
|
+
}
|
398
|
+
}
|
399
|
+
}
|
400
|
+
}
|
401
|
+
}
|
402
|
+
}
|
403
|
+
}
|
404
|
+
},
|
206
405
|
:index_name => { :type => :keyword },
|
406
|
+
:is_native => { :type => :boolean },
|
407
|
+
:language => { :type => :keyword },
|
207
408
|
:last_seen => { :type => :date },
|
409
|
+
:last_sync_error => { :type => :keyword },
|
410
|
+
:last_sync_status => { :type => :keyword },
|
208
411
|
:last_synced => { :type => :date },
|
209
|
-
:
|
210
|
-
:
|
412
|
+
:last_deleted_document_count => { :type => :long },
|
413
|
+
:last_indexed_document_count => { :type => :long },
|
414
|
+
:name => { :type => :keyword },
|
415
|
+
:pipeline => {
|
416
|
+
:properties => {
|
417
|
+
:extract_binary_content => { :type => :boolean },
|
418
|
+
:name => { :type => :keyword },
|
419
|
+
:reduce_whitespace => { :type => :boolean },
|
420
|
+
:run_ml_inference => { :type => :boolean }
|
421
|
+
}
|
422
|
+
},
|
211
423
|
:scheduling => {
|
212
424
|
:properties => {
|
213
425
|
:enabled => { :type => :boolean },
|
@@ -216,9 +428,7 @@ module Core
|
|
216
428
|
},
|
217
429
|
:service_type => { :type => :keyword },
|
218
430
|
:status => { :type => :keyword },
|
219
|
-
:
|
220
|
-
:sync_now => { :type => :boolean },
|
221
|
-
:sync_status => { :type => :keyword }
|
431
|
+
:sync_now => { :type => :boolean }
|
222
432
|
}
|
223
433
|
}
|
224
434
|
ensure_index_exists("#{Utility::Constants::CONNECTORS_INDEX}-v1", system_index_body(:alias_name => Utility::Constants::CONNECTORS_INDEX, :mappings => mappings))
|
@@ -229,32 +439,112 @@ module Core
|
|
229
439
|
# Ensures the "-v1" job index exists, aliased to the job index name, with the
# job document mappings defined below.
def ensure_job_index_exists
  # small builders so every field gets its own fresh mapping hash
  field = ->(type) { { :type => type } }
  object_of = ->(properties) { { :properties => properties } }

  rule_mapping = object_of.call(
    :id => field.call(:keyword),
    :policy => field.call(:keyword),
    :field => field.call(:keyword),
    :rule => field.call(:keyword),
    :value => field.call(:keyword),
    :order => field.call(:short),
    :created_at => field.call(:date),
    :updated_at => field.call(:date)
  )
  advanced_snippet_mapping = object_of.call(
    :value => field.call(:object),
    :created_at => field.call(:date),
    :updated_at => field.call(:date)
  )
  warnings_mapping = object_of.call(
    :ids => field.call(:keyword),
    :messages => field.call(:text)
  )
  filtering_mapping = object_of.call(
    :domain => field.call(:keyword),
    :rules => rule_mapping,
    :advanced_snippet => advanced_snippet_mapping,
    :warnings => warnings_mapping
  )
  pipeline_mapping = object_of.call(
    :extract_binary_content => field.call(:boolean),
    :name => field.call(:keyword),
    :reduce_whitespace => field.call(:boolean),
    :run_ml_inference => field.call(:boolean)
  )
  connector_mapping = object_of.call(
    :configuration => field.call(:object),
    :filtering => filtering_mapping,
    :id => field.call(:keyword),
    :index_name => field.call(:keyword),
    :language => field.call(:keyword),
    :pipeline => pipeline_mapping,
    :service_type => field.call(:keyword)
  )

  mappings = object_of.call(
    :cancelation_requested_at => field.call(:date),
    :canceled_at => field.call(:date),
    :completed_at => field.call(:date),
    :connector => connector_mapping,
    :created_at => field.call(:date),
    :deleted_document_count => field.call(:integer),
    :error => field.call(:text),
    :indexed_document_count => field.call(:integer),
    :indexed_document_volume => field.call(:integer),
    :last_seen => field.call(:date),
    :metadata => field.call(:object),
    :started_at => field.call(:date),
    :status => field.call(:keyword),
    :total_document_count => field.call(:integer),
    :trigger_method => field.call(:keyword),
    :worker_hostname => field.call(:keyword)
  )
  ensure_index_exists("#{Utility::Constants::JOB_INDEX}-v1", system_index_body(:alias_name => Utility::Constants::JOB_INDEX, :mappings => mappings))
end
|
244
508
|
|
245
|
-
def update_connector_fields(connector_id, doc = {})
|
509
|
+
# Applies a partial update to a connector document.
#
# When both seq_no and primary_term are given the update is conditioned on
# them and is not retried on conflict; otherwise it is retried up to 3 times.
# see https://www.elastic.co/guide/en/elasticsearch/reference/current/optimistic-concurrency-control.html
#
# @param connector_id [String] id of the connector to update
# @param doc [Hash] fields to write; an empty hash is a no-op
# @param seq_no [Integer, nil] expected sequence number of the document
# @param primary_term [Integer, nil] expected primary term of the document
# @raise [ConnectorVersionChangedError] when the client reports a conflict
def update_connector_fields(connector_id, doc = {}, seq_no = nil, primary_term = nil)
  return if doc.empty?

  update_args = {
    :index => Utility::Constants::CONNECTORS_INDEX,
    :id => connector_id,
    :body => { :doc => doc },
    :refresh => true
  }
  if seq_no && primary_term
    update_args[:if_seq_no] = seq_no
    update_args[:if_primary_term] = primary_term
  else
    update_args[:retry_on_conflict] = 3
  end

  client.update(update_args)
rescue Elastic::Transport::Transport::Errors::Conflict
  # VersionConflictException
  # see https://www.elastic.co/guide/en/elasticsearch/reference/current/docs-index_.html#optimistic-concurrency-control-index
  raise ConnectorVersionChangedError.new(connector_id, seq_no, primary_term)
end
|
533
|
+
|
534
|
+
# Asks the client for the document count of an index.
#
# @param index_name [String] index to count
# @return the 'count' entry of the client's count response
def document_count(index_name)
  count_response = client.count(:index => index_name)
  count_response['count']
end
|
255
537
|
|
256
538
|
private
|
257
539
|
|
540
|
+
# Tells whether any validated domain is present in the stored filtering.
#
# @param domain_validations [Hash] validation results keyed by domain
# @param filtering [Array<Hash>] filters, each read through its :domain key
# @return [Boolean] true when at least one domain appears on both sides
def should_update_validations?(domain_validations, filtering)
  stored_domains = filtering.map { |filter| filter[:domain] }
  shared_domains = stored_domains & domain_validations.keys
  shared_domains.size.positive?
end
|
547
|
+
|
258
548
|
# Memoized Elasticsearch client built from the :elasticsearch app config.
def client
  return @client if @client

  @client = Utility::EsClient.new(App::Config[:elasticsearch])
end
|
@@ -264,6 +554,15 @@ module Core
|
|
264
554
|
index_version = index_versions.max # gets the largest suffix number
|
265
555
|
"#{alias_name}-v#{index_version}"
|
266
556
|
end
|
557
|
+
|
558
|
+
# Deep-merges the validation result for the filter's domain into its draft.
# The filter is modified in place; filters for other domains are left alone.
#
# @param filter [Hash] one filtering entry, read through its :domain key
# @param domain_validations [Hash] validation results keyed by domain
def update_filter_validation(filter, domain_validations)
  domain = filter[:domain]
  return unless domain_validations.key?(domain)

  filter.deep_merge!({ :draft => { :validation => domain_validations[domain] } })
end
|
267
566
|
end
|
268
567
|
end
|
269
568
|
end
|
@@ -0,0 +1,39 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'core/filtering'
|
10
|
+
require 'utility/filtering'
|
11
|
+
|
12
|
+
module Core
  module Filtering
    # Applies a job's simple filtering rules to documents, one document at a time.
    class PostProcessEngine
      # @return [Array<SimpleRule>] rules in evaluation order, default rule excluded
      attr_reader :rules

      # @param job_description [Hash] job document; rules are read from its
      #   'connector' => 'filtering' entry
      def initialize(job_description)
        @rules = ordered_rules(job_description.dig('connector', 'filtering'))
      end

      # Finds the first rule matching the document; the default rule is used
      # when none matches.
      #
      # @param document [Hash] document to check; keys may be strings or symbols
      # @return [PostProcessResult] the original document paired with the rule that decided
      def process(document)
        # rules look fields up by string key; convert once instead of once per rule
        stringified = document.stringify_keys
        matching_rule = @rules.find { |rule| rule.match?(stringified) } || SimpleRule::DEFAULT_RULE
        PostProcessResult.new(document, matching_rule)
      end

      private

      # Builds SimpleRule objects sorted by their 'order'.
      # The default rule is dropped before sorting, because it is only ever
      # applied as the fallback in #process.
      def ordered_rules(job_filtering)
        # a job can be stored with no rules at all; treat that as an empty list
        job_rules = Utility::Filtering.extract_filter(job_filtering)['rules'] || []
        job_rules
          .reject { |rule| rule['id'] == SimpleRule::DEFAULT_RULE_ID }
          .sort_by { |rule| rule['order'] }
          .map { |rule| SimpleRule.new(rule) }
      end
    end
  end
end
|
@@ -0,0 +1,27 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'utility/logger'
|
10
|
+
|
11
|
+
module Core
  module Filtering
    # Outcome of running one document through the post-process filtering rules.
    class PostProcessResult
      attr_reader :document, :matching_rule

      # Stores the pair and logs the decision at debug level.
      #
      # @param document [Hash] the document that was checked
      # @param matching_rule [#id, #policy, #is_include?] the rule that decided its fate
      def initialize(document, matching_rule)
        @document = document
        @matching_rule = matching_rule
        Utility::Logger.debug("Document '#{document_id}' matched filtering rule: #{matching_rule.id}. It will be #{matching_rule.policy}d")
      end

      # @return [Boolean] whether the matching rule's policy keeps the document
      def is_include?
        matching_rule.is_include?
      end

      private

      # The document arrives with its original keys, so the id may sit under
      # either a string or a symbol key.
      def document_id
        document['id'] || document[:id]
      end
    end
  end
end
|
@@ -0,0 +1,141 @@
|
|
1
|
+
#
|
2
|
+
# Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
|
3
|
+
# or more contributor license agreements. Licensed under the Elastic License;
|
4
|
+
# you may not use this file except in compliance with the Elastic License.
|
5
|
+
#
|
6
|
+
|
7
|
+
# frozen_string_literal: true
|
8
|
+
|
9
|
+
require 'utility/logger'
|
10
|
+
|
11
|
+
module Core
  module Filtering
    # One "simple" filtering rule: a policy (include/exclude), the document
    # field it looks at, the kind of comparison and the value to compare with.
    class SimpleRule
      DEFAULT_RULE_ID = 'DEFAULT'

      class Policy
        INCLUDE = 'include'
        EXCLUDE = 'exclude'
      end

      class Rule
        REGEX = 'regex'
        EQUALS = 'equals'
        STARTS_WITH = 'starts_with'
        ENDS_WITH = 'ends_with'
        CONTAINS = 'contains'
        LESS_THAN = '<'
        GREATER_THAN = '>'
      end

      # Whole-string patterns (\A..\z) so multi-line input cannot match by one of its lines.
      TRUE_PATTERN = /\A(true|t|yes|y|on|1)\z/i.freeze
      FALSE_PATTERN = /\A(false|f|no|n|off|0)\z/i.freeze
      private_constant :TRUE_PATTERN, :FALSE_PATTERN

      attr_reader :policy, :field, :rule, :value, :id

      # @param rule_hash [Hash] must contain the string keys
      #   'policy', 'field', 'rule', 'value' and 'id'
      # @raise [RuntimeError] "<key> is required" when one of them is missing
      def initialize(rule_hash)
        @policy = rule_hash.fetch('policy')
        @field = rule_hash.fetch('field')
        @rule = rule_hash.fetch('rule')
        @value = rule_hash.fetch('value')
        @id = rule_hash.fetch('id')
        @rule_hash = rule_hash
      rescue KeyError => e
        raise "#{e.key} is required"
      end

      # Convenience constructor taking the five attributes positionally.
      def self.from_args(id, policy, field, rule, value)
        SimpleRule.new(
          {
            'id' => id,
            'policy' => policy,
            'field' => field,
            'rule' => rule,
            'value' => value
          }
        )
      end

      DEFAULT_RULE = SimpleRule.new(
        'policy' => 'include',
        'field' => '_',
        'rule' => 'regex',
        'value' => '.*',
        'id' => SimpleRule::DEFAULT_RULE_ID
      )

      # Checks the document against this rule.
      #
      # The default rule matches everything. A document without the rule's
      # field never matches. Values that cannot be ordered against each other
      # do not match `<` / `>` rules instead of raising.
      #
      # @param document [Hash] document with string keys
      # @return [Boolean]
      def match?(document)
        return true if id == DEFAULT_RULE_ID

        doc_value = document[field]
        return false if doc_value.nil?

        coerced_value = coerce(doc_value)
        case rule
        when Rule::EQUALS then values_equal?(doc_value, coerced_value)
        when Rule::STARTS_WITH then doc_value.to_s.start_with?(value.to_s)
        when Rule::ENDS_WITH then doc_value.to_s.end_with?(value.to_s)
        when Rule::CONTAINS then doc_value.to_s.include?(value.to_s)
        when Rule::REGEX then doc_value.to_s.match?(/#{value}/)
        when Rule::LESS_THAN then compare(doc_value, coerced_value, :<)
        when Rule::GREATER_THAN then compare(doc_value, coerced_value, :>)
        else false
        end
      end

      # Converts the rule's value to the type of the document's value so the
      # two can be compared. Falls back to the value as a String when the
      # conversion fails.
      #
      # @param doc_value [Object] the value found in the document
      # @return [String, Integer, Float, DateTime, Time]
      def coerce(doc_value)
        case doc_value
        when String then value.to_s
        when Integer then value.to_i
        when Float then value.to_f
        when DateTime, Time then to_date(value)
        when TrueClass, FalseClass then to_bool(value).to_s # Ruby doesn't have a Boolean type
        else value.to_s
        end
      rescue StandardError => e
        Utility::Logger.debug("Failed to coerce value '#{value}' (#{value.class}) based on document value '#{doc_value}' (#{doc_value.class}) due to error: #{e.class}: #{e.message}")
        value.to_s
      end

      def is_include?
        policy == Policy::INCLUDE
      end

      def is_exclude?
        policy == Policy::EXCLUDE
      end

      def to_h
        @rule_hash
      end

      private

      # Equality for the EQUALS rule, chosen by the type the value was coerced to.
      def values_equal?(doc_value, coerced_value)
        case coerced_value
        when Integer, Float
          doc_value == coerced_value
        when DateTime, Time
          # Time#to_s and DateTime#to_s use different formats, so compare the
          # instants instead, at one-second resolution
          doc_value.to_time.to_i == coerced_value.to_time.to_i
        else
          doc_value.to_s == coerced_value
        end
      end

      # Applies :< or :> and answers false when the two values cannot be ordered.
      def compare(doc_value, coerced_value, operator)
        left = doc_value
        right = coerced_value
        if date_like?(left) && date_like?(right)
          # bring both sides to Time so Time and DateTime can be ordered
          left = left.to_time
          right = right.to_time
        end
        return false unless left.respond_to?(operator)

        left.public_send(operator, right)
      rescue ArgumentError, TypeError => e
        Utility::Logger.debug("Failed to compare document value '#{doc_value}' (#{doc_value.class}) with rule value '#{coerced_value}' (#{coerced_value.class}) due to error: #{e.class}: #{e.message}")
        false
      end

      def date_like?(candidate)
        candidate.is_a?(DateTime) || candidate.is_a?(Time)
      end

      # @raise [ArgumentError] when the value is not a recognised boolean word
      def to_bool(str)
        return str if [true, false].include?(str)

        text = str.to_s
        return false if text.strip.empty?
        return true if text.match?(TRUE_PATTERN)
        return false if text.match?(FALSE_PATTERN)

        raise ArgumentError.new("invalid value for Boolean: \"#{str}\"")
      end

      def to_date(str)
        DateTime.parse(str)
      rescue ArgumentError, TypeError
        # Not a parseable date string (or not a String at all): read it as an epoch number.
        # NOTE(review): this was described as "millis", but Time.at takes
        # seconds since the epoch - confirm which unit rule values use.
        Time.at(str.to_i)
      end
    end
  end
end
|