openc3 7.0.1 → 7.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/bin/openc3cli +47 -3
- data/data/config/item_modifiers.yaml +1 -1
- data/data/config/microservice.yaml +12 -1
- data/data/config/parameter_modifiers.yaml +49 -7
- data/data/config/target.yaml +11 -0
- data/data/config/target_config.yaml +6 -2
- data/lib/openc3/api/cmd_api.rb +2 -1
- data/lib/openc3/api/metrics_api.rb +11 -1
- data/lib/openc3/api/tlm_api.rb +21 -6
- data/lib/openc3/core_ext/faraday.rb +1 -1
- data/lib/openc3/io/json_api.rb +1 -1
- data/lib/openc3/logs/log_writer.rb +3 -1
- data/lib/openc3/microservices/decom_common.rb +128 -0
- data/lib/openc3/microservices/decom_microservice.rb +26 -95
- data/lib/openc3/microservices/interface_decom_common.rb +6 -2
- data/lib/openc3/microservices/interface_microservice.rb +10 -8
- data/lib/openc3/microservices/log_microservice.rb +1 -1
- data/lib/openc3/microservices/microservice.rb +3 -2
- data/lib/openc3/microservices/queue_microservice.rb +1 -1
- data/lib/openc3/microservices/scope_cleanup_microservice.rb +60 -46
- data/lib/openc3/microservices/text_log_microservice.rb +1 -2
- data/lib/openc3/models/cvt_model.rb +24 -13
- data/lib/openc3/models/db_sharded_model.rb +110 -0
- data/lib/openc3/models/interface_model.rb +9 -0
- data/lib/openc3/models/interface_status_model.rb +33 -3
- data/lib/openc3/models/metric_model.rb +96 -37
- data/lib/openc3/models/microservice_model.rb +7 -0
- data/lib/openc3/models/microservice_status_model.rb +30 -3
- data/lib/openc3/models/reingest_job_model.rb +153 -0
- data/lib/openc3/models/scope_model.rb +3 -2
- data/lib/openc3/models/script_status_model.rb +4 -20
- data/lib/openc3/models/target_model.rb +113 -100
- data/lib/openc3/packets/packet_config.rb +4 -1
- data/lib/openc3/script/script.rb +2 -2
- data/lib/openc3/script/script_runner.rb +4 -4
- data/lib/openc3/script/telemetry.rb +3 -3
- data/lib/openc3/script/web_socket_api.rb +29 -22
- data/lib/openc3/system/system.rb +20 -3
- data/lib/openc3/topics/command_decom_topic.rb +4 -2
- data/lib/openc3/topics/command_topic.rb +8 -5
- data/lib/openc3/topics/decom_interface_topic.rb +15 -10
- data/lib/openc3/topics/interface_topic.rb +71 -29
- data/lib/openc3/topics/limits_event_topic.rb +62 -41
- data/lib/openc3/topics/router_topic.rb +61 -21
- data/lib/openc3/topics/system_events_topic.rb +18 -1
- data/lib/openc3/topics/telemetry_decom_topic.rb +2 -1
- data/lib/openc3/topics/telemetry_topic.rb +4 -2
- data/lib/openc3/topics/topic.rb +77 -5
- data/lib/openc3/utilities/aws_bucket.rb +2 -0
- data/lib/openc3/utilities/cli_generator.rb +3 -2
- data/lib/openc3/utilities/metric.rb +15 -1
- data/lib/openc3/utilities/questdb_client.rb +173 -37
- data/lib/openc3/utilities/reingest_job.rb +377 -0
- data/lib/openc3/utilities/ruby_lex_utils.rb +2 -0
- data/lib/openc3/utilities/store_autoload.rb +78 -52
- data/lib/openc3/utilities/store_queued.rb +20 -12
- data/lib/openc3/version.rb +6 -6
- data/templates/plugin/plugin.gemspec +13 -1
- data/templates/tool_angular/package.json +2 -2
- data/templates/tool_react/package.json +1 -1
- data/templates/tool_svelte/package.json +1 -1
- data/templates/tool_vue/package.json +3 -3
- data/templates/tool_vue/src/router.js +2 -2
- data/templates/widget/package.json +2 -2
- metadata +7 -3
|
@@ -28,42 +28,104 @@ module OpenC3
|
|
|
28
28
|
class QuestDBError < StandardError; end
|
|
29
29
|
|
|
30
30
|
# Thread-local PG connection storage using Concurrent::ThreadLocalVar.
|
|
31
|
-
# Each thread gets its own
|
|
31
|
+
# Each thread gets its own connections (per db_shard) to avoid thread-safety issues with PG::Connection.
|
|
32
32
|
# Connections are automatically garbage collected when threads terminate.
|
|
33
|
-
|
|
33
|
+
# Value is a Hash: { db_shard_number => PG::Connection }
|
|
34
|
+
@thread_conns = Concurrent::ThreadLocalVar.new { Hash.new } # NOSONAR
|
|
35
|
+
|
|
36
|
+
# DB_Shard cache: { "scope__target_name" => [db_shard_number, Time] }
|
|
37
|
+
@db_shard_cache = {}
|
|
38
|
+
@db_shard_cache_mutex = Mutex.new
|
|
39
|
+
DB_SHARD_CACHE_TIMEOUT = 60 # seconds
|
|
40
|
+
|
|
41
|
+
# Build the QuestDB hostname used for a given db_shard.
# OPENC3_TSDB_HOSTNAME is treated as a template: every "SHARDNUM" token in it
# is replaced with the db_shard number. A value without the token is returned
# unchanged, so all db_shards resolve to the same host. An unset variable
# yields an empty string.
#
# @param db_shard [#to_s] db_shard number
# @return [String] hostname for that db_shard
def self.hostname_for_db_shard(db_shard)
  host_template = ENV['OPENC3_TSDB_HOSTNAME'].to_s
  shard_text = db_shard.to_s
  host_template.gsub("SHARDNUM", shard_text)
end
|
|
47
|
+
|
|
48
|
+
# Look up the db_shard number for a target from TargetModel.
# Successful lookups are cached for DB_SHARD_CACHE_TIMEOUT seconds.
#
# A failed lookup is never cached: the last known (expired) value is returned
# when there is one, otherwise 0, and the next call tries TargetModel again.
# Caching the fallback would pin a target to db_shard 0 for the whole timeout
# after a single transient error.
#
# @param target_name [String, nil] Target name; nil means non-target data
# @param scope [String] Scope the target belongs to
# @return [Integer] db_shard number (0 for nil target, unknown target or failed lookup)
def self.db_shard_for_target(target_name, scope: "DEFAULT")
  return 0 unless target_name

  cache_key = "#{scope}__#{target_name}"
  now = Time.now
  stale_db_shard = nil

  @db_shard_cache_mutex.synchronize do
    cached = @db_shard_cache[cache_key]
    if cached
      db_shard, cached_at = cached
      return db_shard if (now - cached_at) < DB_SHARD_CACHE_TIMEOUT

      # Expired, but still the best guess if the refresh below fails
      stale_db_shard = db_shard
    end
  end

  # Cache miss or expired — look up from TargetModel (outside the lock)
  begin
    model = TargetModel.get(name: target_name, scope: scope)
    db_shard = model ? model['db_shard'].to_i : 0
  rescue StandardError
    # Do not write the fallback into the cache; retry on the next call
    return stale_db_shard || 0
  end

  @db_shard_cache_mutex.synchronize do
    @db_shard_cache[cache_key] = [db_shard, now]
  end

  db_shard
end
|
|
34
78
|
|
|
35
|
-
# Get or create a thread-local PG connection with type mapping configured.
|
|
79
|
+
# Get or create the current thread's PG connection for the given db_shard,
# with result type mapping configured.
# Returns the thread-local connection - callers should not close it.
#
# A cached connection is reused only if it is not finished and its socket
# check passes. A connection that fails the socket check is finished before a
# replacement is opened, so the dead handle is not left open.
#
# @param db_shard [Integer] db_shard number selecting the QuestDB host
# @return [PG::Connection] usable connection for this thread and db_shard
def self.connection(db_shard: 0)
  conns = @thread_conns.value
  cached = conns[db_shard]
  if cached && !cached.finished?
    begin
      cached.check_socket
      return cached
    rescue StandardError
      # Will need to reconnect. Release the broken connection first.
      begin
        cached.finish
      rescue StandardError
        # Best effort only: the connection is being discarded either way.
      end
    end
  end

  conn = PG::Connection.new(
    host: hostname_for_db_shard(db_shard),
    port: ENV['OPENC3_TSDB_QUERY_PORT'],
    user: ENV['OPENC3_TSDB_USERNAME'],
    password: ENV['OPENC3_TSDB_PASSWORD'],
    dbname: 'qdb'
  )
  conn.type_map_for_results = PG::BasicTypeMapForResults.new(conn)
  conns[db_shard] = conn
  @thread_conns.value = conns
  conn
end
|
|
52
104
|
|
|
53
|
-
# Reset the connection for the current thread. Used after errors.
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
105
|
+
# Drop connection(s) held by the current thread. Used after errors.
# With db_shard nil every cached connection is finished and the cache is
# replaced by an empty Hash; otherwise only that db_shard's connection is
# finished and removed.
#
# @param db_shard [Integer, nil] db_shard to disconnect, or nil for all
def self.disconnect(db_shard: nil)
  conns = @thread_conns.value
  close_if_open = ->(conn) { conn.finish if conn && !conn.finished? }

  if db_shard.nil?
    conns.each_value(&close_if_open)
    @thread_conns.value = {}
  else
    close_if_open.call(conns[db_shard])
    conns.delete(db_shard)
    @thread_conns.value = conns
  end
end
|
|
61
123
|
|
|
62
124
|
# Health check - attempt to connect and immediately close.
|
|
63
125
|
# Returns true if successful, raises on failure.
|
|
64
|
-
def self.check_connection
|
|
126
|
+
def self.check_connection(db_shard: 0)
|
|
65
127
|
conn = PG::Connection.new(
|
|
66
|
-
host:
|
|
128
|
+
host: hostname_for_db_shard(db_shard),
|
|
67
129
|
port: ENV['OPENC3_TSDB_QUERY_PORT'],
|
|
68
130
|
user: ENV['OPENC3_TSDB_USERNAME'],
|
|
69
131
|
password: ENV['OPENC3_TSDB_PASSWORD'],
|
|
@@ -291,14 +353,14 @@ module OpenC3
|
|
|
291
353
|
# @param label [String, nil] Optional label for log messages
|
|
292
354
|
# @return [PG::Result, nil] Query result
|
|
293
355
|
# @raise [RuntimeError] After exhausting retries
|
|
294
|
-
def self.query_with_retry(query, params: [], max_retries: 5, label: nil)
|
|
356
|
+
def self.query_with_retry(query, params: [], max_retries: 5, label: nil, db_shard: 0)
|
|
295
357
|
retry_count = 0
|
|
296
358
|
begin
|
|
297
|
-
conn = connection
|
|
359
|
+
conn = connection(db_shard: db_shard)
|
|
298
360
|
if params.empty?
|
|
299
|
-
conn.exec(query)
|
|
361
|
+
return conn.exec(query)
|
|
300
362
|
else
|
|
301
|
-
conn.exec_params(query, params)
|
|
363
|
+
return conn.exec_params(query, params)
|
|
302
364
|
end
|
|
303
365
|
rescue IOError, PG::Error => e
|
|
304
366
|
retry_count += 1
|
|
@@ -307,7 +369,7 @@ module OpenC3
|
|
|
307
369
|
end
|
|
308
370
|
Logger.warn("TSDB#{label ? " #{label}" : ""}: Retrying due to error: #{e.message}")
|
|
309
371
|
Logger.warn("TSDB#{label ? " #{label}" : ""}: Last query: #{query}")
|
|
310
|
-
disconnect
|
|
372
|
+
disconnect(db_shard: db_shard)
|
|
311
373
|
sleep 0.1
|
|
312
374
|
retry
|
|
313
375
|
end
|
|
@@ -543,11 +605,11 @@ module OpenC3
|
|
|
543
605
|
# @param start_time [Integer] Nanosecond start time
|
|
544
606
|
# @param end_time [Integer, nil] Nanosecond end time
|
|
545
607
|
# @return [Boolean]
|
|
546
|
-
def self.table_has_data?(table_name, start_time, end_time)
|
|
547
|
-
query = "SELECT 1 FROM #{table_name}"
|
|
608
|
+
def self.table_has_data?(table_name, start_time, end_time, db_shard: 0)
|
|
609
|
+
query = "SELECT 1 FROM \"#{table_name}\""
|
|
548
610
|
query += time_where_clause(start_time, end_time)
|
|
549
611
|
query += " LIMIT 1"
|
|
550
|
-
result = query_with_retry(query, max_retries: 1, label: "table_has_data")
|
|
612
|
+
result = query_with_retry(query, max_retries: 1, label: "table_has_data", db_shard: db_shard)
|
|
551
613
|
result && result.ntuples > 0
|
|
552
614
|
rescue RuntimeError
|
|
553
615
|
false
|
|
@@ -560,13 +622,13 @@ module OpenC3
|
|
|
560
622
|
# @param page_size [Integer] Number of rows per page
|
|
561
623
|
# @param label [String] Label for log messages
|
|
562
624
|
# @yield [PG::Result] Each page of results
|
|
563
|
-
def self.paginate_query(query, page_size, label:)
|
|
625
|
+
def self.paginate_query(query, page_size, label:, db_shard: 0)
|
|
564
626
|
min = 0
|
|
565
627
|
max = page_size
|
|
566
628
|
loop do
|
|
567
629
|
query_offset = "#{query} LIMIT #{min}, #{max}"
|
|
568
630
|
Logger.debug("QuestDB #{label}: #{query_offset}")
|
|
569
|
-
result = query_with_retry(query_offset, label: label)
|
|
631
|
+
result = query_with_retry(query_offset, label: label, db_shard: db_shard)
|
|
570
632
|
min += page_size
|
|
571
633
|
max += page_size
|
|
572
634
|
if result.nil? or result.ntuples == 0
|
|
@@ -590,7 +652,7 @@ module OpenC3
|
|
|
590
652
|
names << TIMESTAMP_SELECT
|
|
591
653
|
names << "RECEIVED_TIMESECONDS" if include_received_ts
|
|
592
654
|
names << "COSMOS_EXTRA"
|
|
593
|
-
query = "SELECT #{names.join(', ')} FROM #{table_name}"
|
|
655
|
+
query = "SELECT #{names.join(', ')} FROM \"#{table_name}\""
|
|
594
656
|
query += time_where_clause(start_time, end_time)
|
|
595
657
|
query
|
|
596
658
|
end
|
|
@@ -808,6 +870,8 @@ module OpenC3
|
|
|
808
870
|
|
|
809
871
|
# Query historical telemetry data from QuestDB for a list of items.
|
|
810
872
|
# Builds the SQL query, executes it, and decodes all results.
|
|
873
|
+
# Supports cross-db_shard queries by grouping items by db_shard, executing
|
|
874
|
+
# separate queries per db_shard, and merging results positionally.
|
|
811
875
|
#
|
|
812
876
|
# @param items [Array] Array of [target_name, packet_name, item_name, value_type, limits]
|
|
813
877
|
# item_name may be nil to indicate a placeholder (non-existent item)
|
|
@@ -817,6 +881,78 @@ module OpenC3
|
|
|
817
881
|
# @return [Array, Hash] Array of [value, limits_state] pairs per row, or {} if no results.
|
|
818
882
|
# Single-row results return a flat array; multi-row results return array of arrays.
|
|
819
883
|
def self.tsdb_lookup(items, start_time:, end_time: nil, scope: "DEFAULT")
  # Bucket the items per db_shard, remembering where each one sat in `items`
  groups = {}
  items.each_with_index do |item, index|
    shard = db_shard_for_target(item[0], scope: scope)
    bucket = (groups[shard] ||= { positions: [], items: [] })
    bucket[:positions] << index
    bucket[:items] << item
  end

  query_shard = lambda do |shard, bucket|
    tsdb_lookup_single_db_shard(bucket[:items], start_time: start_time, end_time: end_time, scope: scope, db_shard: shard)
  end

  # Everything lives on one db_shard: hand the result back untouched
  return query_shard.call(*groups.first) if groups.length == 1

  # Cross-db_shard: one query per db_shard, merged below
  results = groups.each_with_object({}) do |(shard, bucket), collected|
    collected[shard] = query_shard.call(shard, bucket)
  end
  return {} if results.each_value.all? { |res| res == {} }

  # Positions no db_shard fills in keep the [nil, nil] placeholder
  blank_row = -> { Array.new(items.length) { [nil, nil] } }

  unless end_time
    # Single-row mode: each db_shard result is a flat array of [value, limits]
    # pairs in the order of that db_shard's items.
    merged = blank_row.call
    groups.each do |shard, bucket|
      cells = results[shard]
      next unless cells.is_a?(Array)

      bucket[:positions].each_with_index do |target_pos, source_pos|
        cell = cells[source_pos]
        merged[target_pos] = cell if cell
      end
    end
    return merged
  end

  # Multi-row mode: rows are matched up by row index across db_shards and the
  # longest result decides how many rows come back.
  # NOTE(review): if a db_shard can return the flat single-row shape while
  # end_time is set, its pairs would be treated as rows here — confirm against
  # tsdb_lookup_single_db_shard.
  row_counts = results.values.map { |res| res.is_a?(Array) ? res.length : 0 }
  max_rows = row_counts.max || 0
  return {} if max_rows == 0

  merged = Array.new(max_rows) { blank_row.call }
  groups.each do |shard, bucket|
    rows = results[shard]
    next unless rows.is_a?(Array)

    rows.each_with_index do |row, row_num|
      next unless row.is_a?(Array)

      bucket[:positions].each_with_index do |target_pos, source_pos|
        cell = row[source_pos]
        merged[row_num][target_pos] = cell if cell
      end
    end
  end
  merged
end
|
|
952
|
+
|
|
953
|
+
# Execute a tsdb_lookup query against a single db_shard.
|
|
954
|
+
# This contains the original ASOF JOIN logic for items all on the same QuestDB instance.
|
|
955
|
+
def self.tsdb_lookup_single_db_shard(items, start_time:, end_time: nil, scope: "DEFAULT", db_shard: 0)
|
|
820
956
|
tables = {}
|
|
821
957
|
names = []
|
|
822
958
|
nil_count = 0
|
|
@@ -888,9 +1024,9 @@ module OpenC3
|
|
|
888
1024
|
query = "SELECT #{names.join(", ")} FROM "
|
|
889
1025
|
tables.each_with_index do |(table_name, _), index|
|
|
890
1026
|
if index == 0
|
|
891
|
-
query += "#{table_name} as T#{index} "
|
|
1027
|
+
query += "\"#{table_name}\" as T#{index} "
|
|
892
1028
|
else
|
|
893
|
-
query += "ASOF JOIN #{table_name} as T#{index} "
|
|
1029
|
+
query += "ASOF JOIN \"#{table_name}\" as T#{index} "
|
|
894
1030
|
end
|
|
895
1031
|
end
|
|
896
1032
|
query_params = []
|
|
@@ -903,7 +1039,7 @@ module OpenC3
|
|
|
903
1039
|
query_params << end_time
|
|
904
1040
|
end
|
|
905
1041
|
|
|
906
|
-
result = query_with_retry(query, params: query_params, label: "tsdb_lookup")
|
|
1042
|
+
result = query_with_retry(query, params: query_params, label: "tsdb_lookup", db_shard: db_shard)
|
|
907
1043
|
if result.nil? or result.ntuples == 0
|
|
908
1044
|
return {}
|
|
909
1045
|
end
|
|
@@ -0,0 +1,377 @@
|
|
|
1
|
+
# encoding: ascii-8bit
|
|
2
|
+
|
|
3
|
+
# Copyright 2026 OpenC3, Inc.
|
|
4
|
+
# All Rights Reserved.
|
|
5
|
+
#
|
|
6
|
+
# This program is distributed in the hope that it will be useful,
|
|
7
|
+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
8
|
+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
|
9
|
+
# See LICENSE.md for more details.
|
|
10
|
+
#
|
|
11
|
+
# This file may also be used under the terms of a commercial license
|
|
12
|
+
# if purchased from OpenC3, Inc.
|
|
13
|
+
|
|
14
|
+
require 'fileutils'
|
|
15
|
+
require 'tmpdir'
|
|
16
|
+
require 'openc3/system/system'
|
|
17
|
+
require 'openc3/utilities/bucket'
|
|
18
|
+
require 'openc3/utilities/bucket_utilities'
|
|
19
|
+
require 'openc3/utilities/logger'
|
|
20
|
+
require 'openc3/utilities/questdb_client'
|
|
21
|
+
require 'openc3/logs/packet_log_reader'
|
|
22
|
+
require 'openc3/microservices/decom_common'
|
|
23
|
+
require 'openc3/models/reingest_job_model'
|
|
24
|
+
|
|
25
|
+
module OpenC3
|
|
26
|
+
class ReingestJobError < StandardError; end
|
|
27
|
+
|
|
28
|
+
# Replays raw .bin.gz log files from a bucket, decommutating each packet via
|
|
29
|
+
# DecomCommon.decom_and_publish(check_limits: false) so historical data
|
|
30
|
+
# reaches QuestDB without re-firing limits events.
|
|
31
|
+
#
|
|
32
|
+
# Runs synchronously (caller wraps in a Thread). Tracks state in a
|
|
33
|
+
# ReingestJobModel. DEDUP is enabled on affected tables during the job and
|
|
34
|
+
# disabled in an ensure block on completion (or after a cooldown window so
|
|
35
|
+
# in-flight WAL commits are covered).
|
|
36
|
+
#
|
|
37
|
+
# target_version:
|
|
38
|
+
# - 'as_logged' (default): each file is decoded with the target config hash
|
|
39
|
+
# that was in effect when the packets were originally logged. Files are
|
|
40
|
+
# grouped by their embedded target_id and System is rebuilt per group.
|
|
41
|
+
# - 'current': all files are decoded with the latest target config.
|
|
42
|
+
# - <hash>: explicit hash, used for every file in the job.
|
|
43
|
+
class ReingestJob
|
|
44
|
+
# How often to persist progress during the ingest pass (write every N packets)
|
|
45
|
+
STATUS_UPDATE_EVERY = 500
|
|
46
|
+
# How often to tick the heartbeat during the cooldown sleep
|
|
47
|
+
HEARTBEAT_INTERVAL_SEC = 10
|
|
48
|
+
|
|
49
|
+
# Reingest rebuilds the process-global System singleton. Serialize all
|
|
50
|
+
# reingest jobs running in this process so they don't stomp each other.
|
|
51
|
+
@@run_mutex = Mutex.new
|
|
52
|
+
|
|
53
|
+
# Capture the job parameters; no work happens until #run is called.
#
# @param job_id [String] Name of the ReingestJobModel tracking this job
# @param files [Array<String>] File names, relative to path, to reingest
# @param path [String] Bucket key prefix the files live under
# @param bucket [String] Name of the ENV variable that holds the bucket name
# @param scope [String] Scope the job runs in
# @param target_version ['as_logged', 'current', String] Target config selection
# @param dedup_cooldown_seconds [Integer] Wait time before DEDUP is disabled again;
#   defaults to OPENC3_REINGEST_DEDUP_COOLDOWN or 60
# @param logger [#warn, #error] Logger used for job messages
def initialize(job_id:, files:, path:, bucket:, scope:,
               target_version: 'as_logged',
               dedup_cooldown_seconds: ENV.fetch('OPENC3_REINGEST_DEDUP_COOLDOWN', 60).to_i,
               logger: Logger)
  @job_id, @scope = job_id, scope
  @files, @path, @bucket_env = files, path, bucket
  @target_version = target_version
  @dedup_cooldown_seconds = dedup_cooldown_seconds
  @logger = logger
end
|
|
66
|
+
|
|
67
|
+
# Execute the whole reingest job: download the files, enable DEDUP on the
# affected tables, ingest every file, wait out the cooldown, then disable the
# DEDUP settings this job turned on. Progress and the final state ('Complete'
# or 'Crashed') are written to the job model through `mark`.
#
# Failures inside the locked section are logged and recorded on the job
# instead of being re-raised (unless recording the failure itself raises).
#
# @raise [ReingestJobError] when the job model cannot be loaded
def run
  # Load the job before creating the temp dir: both happen outside the
  # ensure block below, so a missing job must not leave a directory behind.
  job = load_job
  tmp_dir = Dir.mktmpdir
  dedup_enabled_by_us = []
  db_shard = 0
  @@run_mutex.synchronize do
    begin
      mark(job, state: 'Running', progress_phase: 'downloading',
           started_at: Time.now.utc.iso8601,
           progress_total: @files.length)

      # Parse target from path, e.g. "DEFAULT/raw_logs/tlm/INST/20260421/"
      # → "INST". Fail fast if the path doesn't encode one — otherwise
      # ingest would run against whatever System was loaded in this process
      # from a prior job (or raise opaquely inside PacketLogReader), and
      # the job could mark Complete with rows written under the wrong
      # target config.
      path_parts = @path.to_s.split('/').reject(&:empty?)
      unless path_parts.length >= 4 && path_parts[1] == 'raw_logs'
        raise ReingestJobError, "Cannot determine target from path '#{@path}'; expected '{scope}/raw_logs/{tlm|cmd}/{target}/'"
      end
      target = path_parts[3]
      db_shard = QuestDBClient.db_shard_for_target(target, scope: @scope)

      local_files = download_and_uncompress(job, tmp_dir)

      # Pass 1: read raw (no System required) to discover table names and
      # each file's embedded target hash. File hashes are what the "as
      # logged" mode uses to pick the right target_version per file.
      mark(job, progress_phase: 'enabling_dedup', progress_current: 0,
           progress_total: 0)
      table_names, file_versions = discover_tables_and_versions(local_files)
      mark(job, table_names: table_names, progress_total: table_names.length)

      dedup_enabled_by_us, preexisting = enable_dedup(job, table_names, db_shard)
      mark(job,
           dedup_enabled_by_us: dedup_enabled_by_us,
           dedup_preexisting: preexisting,
           dedup_enabled_at: Time.now.utc.iso8601)

      # Pass 2: group files by the target_version we'll load for them,
      # then ingest each group under its own System instance.
      groups = group_files_by_version(local_files, file_versions)
      mark(job, versions_used: groups.keys,
           progress_phase: 'ingesting', progress_current: 0,
           progress_total: 0, packets_written: 0)
      ingest_all_groups(job, groups, target)

      mark(job, progress_phase: 'dedup_cooldown')
      cooldown(job)

      mark(job, progress_phase: 'disabling_dedup')
      disabled = disable_dedup(job, dedup_enabled_by_us, db_shard)
      mark(job, dedup_disabled_tables: disabled,
           dedup_disabled_at: Time.now.utc.iso8601,
           state: 'Complete',
           finished_at: Time.now.utc.iso8601)
    rescue Exception => e
      # NOTE(review): Exception (not StandardError) is rescued and not
      # re-raised; presumably so that any failure still reverts DEDUP and
      # marks the job Crashed — confirm this is intended for signals/exit.
      @logger.error("Reingest job #{@job_id} failed: #{e.message}\n#{e.backtrace.first(10).join("\n")}")
      # Always try to revert DEDUP even on crash so user tables are not left altered
      disabled_on_crash = []
      begin
        disabled_on_crash = disable_dedup(job, dedup_enabled_by_us, db_shard)
      rescue => de
        @logger.error("Reingest job #{@job_id} failed to disable DEDUP during crash cleanup: #{de.message}")
      end
      mark(job,
           dedup_disabled_tables: disabled_on_crash,
           dedup_disabled_at: Time.now.utc.iso8601,
           state: 'Crashed',
           error: e.message,
           finished_at: Time.now.utc.iso8601)
    ensure
      FileUtils.remove_entry_secure(tmp_dir, true) if tmp_dir && File.directory?(tmp_dir)
    end
  end
end
|
|
144
|
+
|
|
145
|
+
private
|
|
146
|
+
|
|
147
|
+
# Fetch the ReingestJobModel for this job id and scope.
#
# @return [Object] whatever ReingestJobModel.get_model returns for the job
# @raise [ReingestJobError] when get_model returns nothing
def load_job
  model = ReingestJobModel.get_model(name: @job_id, scope: @scope)
  return model if model

  raise ReingestJobError, "ReingestJobModel #{@job_id} not found in scope #{@scope}"
end
|
|
151
|
+
|
|
152
|
+
# Assign each given attribute on the job through its writer, then persist the
# job with #update. Calling it without attributes only persists, which serves
# as a heartbeat.
#
# @param job [Object] job model exposing attribute writers and #update
# @param attrs [Hash] attribute name => new value
# @return [Object] result of job.update
def mark(job, **attrs)
  attrs.each_pair do |attribute, value|
    job.send("#{attribute}=", value)
  end
  job.update
end
|
|
158
|
+
|
|
159
|
+
# Download every requested file from the bucket into tmp_dir, uncompressing
# '.gz' files (the compressed copy is deleted afterwards), and bump
# progress_current on the job after each file.
#
# @param job [Object] job model passed through to `mark`
# @param tmp_dir [String] directory the files are written into
# @return [Array<String>] local paths of the files ready to be read
# @raise [ReingestJobError] if the bucket ENV variable is unset or a filename
#   resolves outside tmp_dir
def download_and_uncompress(job, tmp_dir)
  bucket_name = ENV.fetch(@bucket_env) { |name| raise ReingestJobError, "Unknown bucket #{name}" }
  client = Bucket.getClient()
  sandbox = File.expand_path(tmp_dir) + File::SEPARATOR
  downloaded = []
  @files.each.with_index(1) do |filename, ordinal|
    destination = File.expand_path(File.join(tmp_dir, filename))
    # Defense-in-depth: the controller validates filenames, but refuse to
    # write outside tmp_dir if any caller (tests, future callers) bypasses it.
    raise ReingestJobError, "Invalid filename escapes tmp dir: #{filename}" unless destination.start_with?(sandbox)

    FileUtils.mkdir_p(File.dirname(destination))
    client.get_object(bucket: bucket_name, key: "#{@path}#{filename}", path: destination)
    if File.extname(filename) == '.gz'
      unpacked = BucketUtilities.uncompress_file(destination)
      File.delete(destination)
      downloaded << unpacked
    else
      downloaded << destination
    end
    mark(job, progress_current: ordinal)
  end
  downloaded
end
|
|
185
|
+
|
|
186
|
+
# Scan every file with identify_and_define off (second argument false) and
# collect two things:
#   - the distinct "{scope}__{CMD|TLM}__{target}__{packet}" table names of all
#     packets that carry both a target name and a packet name
#   - per file, the hex form of the first entry of the reader's @target_ids,
#     or nil when the reader has none
#
# @param local_files [Array<String>] paths of the log files to scan
# @return [Array(Array<String>, Hash)] [table_names, file_versions] where
#   file_versions maps local file path => hex hash or nil
def discover_tables_and_versions(local_files)
  tables = Set.new
  versions = {}
  local_files.each do |path|
    log_reader = PacketLogReader.new
    log_reader.each(path, false) do |pkt|
      next if !pkt.target_name || !pkt.packet_name

      kind = pkt.cmd_or_tlm == :CMD ? 'CMD' : 'TLM'
      tables << "#{@scope}__#{kind}__#{pkt.target_name}__#{pkt.packet_name}"
    end
    # The reader does not expose the ids publicly, hence instance_variable_get
    first_id = (log_reader.instance_variable_get(:@target_ids) || []).first
    versions[path] = first_id ? first_id.unpack1('H*') : nil
  end
  [tables.to_a, versions]
end
|
|
206
|
+
|
|
207
|
+
# Decide which target_version each file is ingested under.
#   - 'current': every file goes under the 'current' key
#   - 'as_logged' or nil: each file goes under its own recorded version, or
#     under 'current' when it has none
#   - anything else: treated as an explicit version used for every file
#
# @param local_files [Array<String>] local file paths
# @param file_versions [Hash] local file path => version hash or nil
# @return [Hash] target_version => [local_file, ...]; unknown keys default to
#   a new empty Array
def group_files_by_version(local_files, file_versions)
  groups = Hash.new { |hash, key| hash[key] = [] }
  if @target_version == 'current'
    groups['current'] = local_files.dup
  elsif @target_version.nil? || @target_version == 'as_logged'
    local_files.each { |file| groups[file_versions[file] || 'current'] << file }
  else
    # Caller passed an explicit hash; use it for every file.
    groups[@target_version] = local_files.dup
  end
  groups
end
|
|
226
|
+
|
|
227
|
+
# Ingest every version group in turn. For each group the System is loaded via
# load_system_with_fallback; if that returns nil the group is skipped (only
# the warnings are persisted), otherwise each file of the group is ingested.
# The packet count and warnings are persisted once more at the end.
#
# @param job [Object] job model; its existing warnings are copied and extended
# @param groups [Hash] target_version => [local_file, ...]
# @param target [String] target name passed on to the System loader
# @return [Object] result of the final `mark`
def ingest_all_groups(job, groups, target)
  written = 0
  last_reported = 0
  warnings = (job.warnings || []).dup
  groups.each_pair do |version, files|
    if load_system_with_fallback(target, version, warnings)
      mark(job, warnings: warnings) if warnings.any?
      files.each do |file|
        written, last_reported = ingest_file(job, file, written, last_reported)
      end
    else
      # Even the 'current' fallback failed; skip this group rather than
      # publish empty json_data for every packet.
      mark(job, warnings: warnings)
    end
  end
  mark(job, packets_written: written, warnings: warnings)
end
|
|
256
|
+
|
|
257
|
+
# Load the System for `version`, falling back to 'current' when that fails.
# A failure while loading 'current' as the requested version is re-raised,
# because there is nothing left to fall back to. Every fallback or failure
# appends a human-readable entry to `warnings`.
#
# @param target [String] target name
# @param version [String] requested target_version
# @param warnings [Array<String>] collector that is appended to in place
# @return [String, nil] the version actually loaded, or nil if the 'current'
#   fallback failed too
def load_system_with_fallback(target, version, warnings)
  primary_error = nil
  begin
    load_system(target, version)
  rescue => e
    # Caller explicitly requested 'current' and that failed; no further
    # fallback exists — propagate so the outer rescue marks Crashed.
    raise if version == 'current'

    primary_error = e
  end
  return version unless primary_error

  @logger.warn("Reingest job #{@job_id}: target archive for #{target} version '#{version}' unavailable (#{primary_error.class}: #{primary_error.message}); falling back to 'current'")
  warnings << "Version '#{version}' archive missing; used 'current' instead"

  begin
    load_system(target, 'current')
    'current'
  rescue => fallback_error
    @logger.error("Reingest job #{@job_id}: fallback to 'current' also failed: #{fallback_error.class}: #{fallback_error.message}")
    warnings << "Version '#{version}' archive missing and 'current' also failed (#{fallback_error.message})"
    nil
  end
end
|
|
283
|
+
|
|
284
|
+
# Discard the current System instance and set it up again for a single target
# at the given target_version, using the system temp directory as the base
# directory handed to System.setup_targets.
#
# @param target [String] target name
# @param version [String] target_version passed to System.setup_targets
# @return [Object] result of System.setup_targets
def load_system(target, version)
  System.reset_instance!
  target_list = [target]
  System.setup_targets(
    target_list,
    Dir.tmpdir,
    scope: @scope,
    target_version: version
  )
end
|
|
288
|
+
|
|
289
|
+
# Read one log file with identify_and_define on (second argument true) and
# publish each packet through DecomCommon.decom_and_publish with limits
# checking off. Packets lacking a target name or packet name are skipped.
# packets_written is persisted on the job every STATUS_UPDATE_EVERY packets.
#
# @param job [Object] job model passed through to `mark`
# @param local_file [String] path of the log file to ingest
# @param packets_written [Integer] running packet count so far
# @param last_status_at [Integer] packet count at the last persisted update
# @return [Array(Integer, Integer)] updated [packets_written, last_status_at]
def ingest_file(job, local_file, packets_written, last_status_at)
  PacketLogReader.new.each(local_file, true) do |pkt|
    next if !pkt.target_name || !pkt.packet_name

    pkt.stored = true
    DecomCommon.decom_and_publish(
      pkt,
      scope: @scope,
      target_names: [pkt.target_name],
      logger: @logger,
      name: "REINGEST:#{@job_id}",
      check_limits: false,
    )
    packets_written += 1
    next if packets_written - last_status_at < STATUS_UPDATE_EVERY

    mark(job, packets_written: packets_written)
    last_status_at = packets_written
  end
  [packets_written, last_status_at]
end
|
|
310
|
+
|
|
311
|
+
# Turn DEDUP on (upsert key PACKET_TIMESECONDS) for every table that does not
# already have it. Tables that already had DEDUP are reported separately so
# teardown can leave them alone. A failure on one table is logged as a warning
# and does not stop the loop; progress_current is bumped after every table.
#
# @param job [Object] job model passed through to `mark`
# @param table_names [Array<String>] tables to process
# @param db_shard [Integer] db_shard whose connection is used
# @return [Array(Array<String>, Array<String>)] [enabled_by_us, preexisting]
def enable_dedup(job, table_names, db_shard)
  ours = []
  already_on = []
  conn = QuestDBClient.connection(db_shard: db_shard)
  table_names.each.with_index(1) do |table, done|
    begin
      if dedup_already_enabled?(conn, table)
        already_on << table
      else
        conn.exec("ALTER TABLE '#{table}' DEDUP ENABLE UPSERT KEYS(PACKET_TIMESECONDS)")
        ours << table
      end
    rescue => e
      @logger.warn("Failed to enable DEDUP on #{table}: #{e.message}")
    end
    mark(job, progress_current: done)
  end
  [ours, already_on]
end
|
|
333
|
+
|
|
334
|
+
# Ask QuestDB's tables() function whether DEDUP is on for a table.
# Any error is logged as a warning and treated as "not enabled", which makes
# the caller issue the ALTER.
#
# @param conn [#exec_params] connection used for the query
# @param table_name [String] table to check
# @return [Boolean] true only when the dedup column reads as true
def dedup_already_enabled?(conn, table_name)
  rows = conn.exec_params(
    "SELECT dedup FROM tables() WHERE table_name = $1",
    [table_name],
  )
  return false if rows.ntuples == 0

  # The column may arrive as a real boolean or as text
  flag = rows[0]['dedup']
  return true if flag == true || flag == 't'

  flag.to_s.downcase == 'true'
rescue => e
  @logger.warn("Could not query DEDUP status for #{table_name}: #{e.message}")
  false
end
|
|
348
|
+
|
|
349
|
+
# Wait for @dedup_cooldown_seconds in slices of at most
# HEARTBEAT_INTERVAL_SEC, persisting the job after every slice so its
# updated_at keeps moving while nothing else is happening. This gives the
# Python TsdbMicroservice and QuestDB WAL time to commit reingested rows
# while DEDUP is still on. A cooldown of zero or less returns immediately.
#
# @param job [Object] job model passed through to `mark`
def cooldown(job)
  seconds_left = @dedup_cooldown_seconds
  until seconds_left <= 0
    nap = seconds_left < HEARTBEAT_INTERVAL_SEC ? seconds_left : HEARTBEAT_INTERVAL_SEC
    sleep(nap)
    seconds_left -= nap
    mark(job) # heartbeat only
  end
end
|
|
361
|
+
|
|
362
|
+
# Turn DEDUP off again for the given tables. A failure on one table is logged
# as a warning and does not stop the loop; progress_current and
# progress_total are persisted after every table.
#
# @param job [Object] job model passed through to `mark`
# @param tables [Array<String>] tables to revert
# @param db_shard [Integer] db_shard whose connection is used
# @return [Array<String>] tables on which DEDUP was actually disabled
def disable_dedup(job, tables, db_shard)
  reverted = []
  conn = QuestDBClient.connection(db_shard: db_shard)
  total = tables.length
  tables.each.with_index(1) do |table, done|
    begin
      conn.exec("ALTER TABLE '#{table}' DEDUP DISABLE")
      reverted << table
    rescue => e
      @logger.warn("Failed to disable DEDUP on #{table}: #{e.message}")
    end
    mark(job, progress_current: done, progress_total: total)
  end
  reverted
end
|
|
376
|
+
end
|
|
377
|
+
end
|