schema-tools 1.0.9 → 1.0.10
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +46 -1
- data/lib/schema_tools/schema_files.rb +15 -0
- data/lib/schema_tools/seed.rb +34 -25
- data/lib/seeder/base_doc_seeder.rb +7 -0
- data/lib/seeder/custom_doc_seeder.rb +15 -0
- data/lib/seeder/mappings_doc_seeder.rb +451 -0
- data/lib/seeder/sample_doc_seeder.rb +20 -0
- data/lib/seeder/seeder.rb +84 -506
- metadata +6 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: ace88e188ea3e99453282c5b8a4f3160aabd50a1791b44adbbbbaa96ec2d9801
|
4
|
+
data.tar.gz: c4cb7b090362308b92c9e8f857110d7d104eef7907649825669fc46cc34f567b
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9d69414e92b731a19f47c98cb97a0cb8c6f2994be31f61930d7fcab7cc60272bd4c5af2790cbdd5eae7061a737eef744f7ab643d58c0647dfb3b166ad1f8d2b3
|
7
|
+
data.tar.gz: 2441ceb2f6d920aaf4e33b1116978f014e093bd433d4dc2ee3206b79d06c5dd43353de449f3fb7cc35a941a104eacdd8497e4601f246e4d2db5eaae79bb4a03b
|
data/README.md
CHANGED
@@ -134,10 +134,55 @@ schemas/users
|
|
134
134
|
|
135
135
|
Each schema folder name matches the name of an alias.
|
136
136
|
|
137
|
-
##
|
137
|
+
## Seed sample data
|
138
138
|
|
139
139
|
Use `rake schema:seed` to seed an index with sample documents that conform to your schema.
|
140
140
|
|
141
|
+
The seeder can generate sample docs for an index in three ways:
|
142
|
+
|
143
|
+
1. (Default) Mappings-based seeder
|
144
|
+
|
145
|
+
The seeder generates random data that conforms to the index's mappings.
|
146
|
+
|
147
|
+
2. Sample-based seeder
|
148
|
+
|
149
|
+
Add a `sample_docs.json` file in the schema folder with example docs to randomly select from when seeding:
|
150
|
+
|
151
|
+
```json
|
152
|
+
{
|
153
|
+
"hits": [
|
154
|
+
{
|
155
|
+
"_source": {
|
156
|
+
"title": "Foo",
|
157
|
+
"desc": "Bar"
|
158
|
+
}
|
159
|
+
},
|
160
|
+
...
|
161
|
+
]
|
162
|
+
}
|
163
|
+
```
|
164
|
+
|
165
|
+
3. Custom document seeder
|
166
|
+
|
167
|
+
Add a `doc_seeder.rb` file in the schema folder that defines a `DocSeeder` class:
|
168
|
+
|
169
|
+
```ruby
|
170
|
+
# schema:seed invokes this class when seeding test data for this index
|
171
|
+
class DocSeeder
|
172
|
+
def initialize(index_or_alias_name) end
|
173
|
+
def generate_document
|
174
|
+
return {
|
175
|
+
'title' => 'Foo',
|
176
|
+
'desc' => 'Bar'
|
177
|
+
}
|
178
|
+
end
|
179
|
+
end
|
180
|
+
```
|
181
|
+
|
182
|
+
The seeder first looks for a Custom document seeder. If none is found, it falls back to the Sample-based seeder. If no sample documents are found, it falls back to the Mappings-based seeder.
|
183
|
+
|
184
|
+
## Other settings and tasks
|
185
|
+
|
141
186
|
Use `DRYRUN` to simulate but not apply any POST/PUT/DELETE operations to your index:
|
142
187
|
|
143
188
|
```
|
@@ -23,6 +23,21 @@ module SchemaTools
|
|
23
23
|
File.exist?(script_path) ? File.read(script_path) : nil
|
24
24
|
end
|
25
25
|
|
26
|
+
# Load the optional sample documents for a schema.
#
# alias_name - schema folder name under Config.schemas_path.
#
# Returns the parsed contents of <schemas_path>/<alias_name>/sample_docs.json,
# or nil when that file does not exist.
def self.get_sample_docs(alias_name)
  path = File.join(Config.schemas_path, alias_name, 'sample_docs.json')
  File.exist?(path) ? JSON.parse(File.read(path)) : nil
end
|
32
|
+
|
33
|
+
# Load the optional custom document seeder for a schema.
#
# alias_name - schema folder name under Config.schemas_path.
#
# Requires <schemas_path>/<alias_name>/doc_seeder.rb when it exists and
# returns the DocSeeder class that file is expected to define.
#
# Returns nil when the file does not exist.
# Raises NameError (naming the offending file) when the file loads but
# leaves DocSeeder undefined.
def self.get_doc_seeder_class(alias_name)
  seeder_path = File.join(Config.schemas_path, alias_name, 'doc_seeder.rb')
  return nil unless File.exist?(seeder_path)

  require(File.expand_path(seeder_path))

  begin
    DocSeeder
  rescue NameError
    # Same exception class as before, but with a message that points at the file.
    raise NameError, "#{seeder_path} was loaded but does not define a DocSeeder class"
  end
end
|
40
|
+
|
26
41
|
def self.discover_all_schemas
|
27
42
|
return [] unless Dir.exist?(Config.schemas_path)
|
28
43
|
|
data/lib/schema_tools/seed.rb
CHANGED
@@ -1,21 +1,43 @@
|
|
1
1
|
module SchemaTools
|
2
2
|
def self.seed(client:)
|
3
|
-
# List available indices (connection already validated during client initialization)
|
3
|
+
# List available indices and aliases (connection already validated during client initialization)
|
4
4
|
puts "Connecting to #{Config.connection_url}..."
|
5
|
+
aliases = client.list_aliases
|
5
6
|
indices = client.list_indices
|
6
7
|
|
7
|
-
|
8
|
-
|
8
|
+
single_aliases = aliases.select { |alias_name, indices| indices.length == 1 && !alias_name.start_with?('.') }
|
9
|
+
unaliased_indices = indices.reject { |index| aliases.values.flatten.include?(index) || index.start_with?('.') || client.index_closed?(index) }
|
10
|
+
|
11
|
+
# Create a combined list with sequential numbering
|
12
|
+
options = []
|
13
|
+
|
14
|
+
if single_aliases.empty? && unaliased_indices.empty?
|
15
|
+
puts "No indices or aliases found in the cluster."
|
9
16
|
puts "Please create an index first."
|
10
17
|
exit 0
|
11
18
|
end
|
12
19
|
|
13
|
-
puts "Available indices:"
|
14
|
-
|
15
|
-
|
20
|
+
puts "Available indices and aliases:"
|
21
|
+
|
22
|
+
# Show aliases first
|
23
|
+
if single_aliases.any?
|
24
|
+
single_aliases.each do |alias_name, indices|
|
25
|
+
option_number = options.length + 1
|
26
|
+
options << { type: :alias, name: alias_name, index: indices.first }
|
27
|
+
puts "#{option_number}. #{alias_name} -> #{indices.first}"
|
28
|
+
end
|
29
|
+
end
|
30
|
+
|
31
|
+
# Show unaliased indices
|
32
|
+
if unaliased_indices.any?
|
33
|
+
unaliased_indices.each do |index_name|
|
34
|
+
option_number = options.length + 1
|
35
|
+
options << { type: :index, name: index_name, index: index_name }
|
36
|
+
puts "#{option_number}. #{index_name}"
|
37
|
+
end
|
16
38
|
end
|
17
39
|
|
18
|
-
puts "\nPlease select an index by number (1-#{
|
40
|
+
puts "\nPlease select an index or alias by number (1-#{options.length}):"
|
19
41
|
selection_input = STDIN.gets&.chomp
|
20
42
|
if selection_input.nil?
|
21
43
|
puts "No input provided. Exiting."
|
@@ -23,24 +45,13 @@ module SchemaTools
|
|
23
45
|
end
|
24
46
|
selection = selection_input.to_i
|
25
47
|
|
26
|
-
if selection < 1 || selection >
|
48
|
+
if selection < 1 || selection > options.length
|
27
49
|
puts "Invalid selection. Please run the task again and select a valid number."
|
28
50
|
exit 1
|
29
51
|
end
|
30
52
|
|
31
|
-
|
32
|
-
puts "Selected
|
33
|
-
|
34
|
-
# Fetch the mappings for the selected index
|
35
|
-
puts "Fetching mappings for #{selected_index}..."
|
36
|
-
mappings = client.get_index_mappings(selected_index)
|
37
|
-
|
38
|
-
if mappings.nil?
|
39
|
-
puts "Failed to fetch mappings for #{selected_index}"
|
40
|
-
exit 1
|
41
|
-
end
|
42
|
-
|
43
|
-
puts "Mappings fetched successfully."
|
53
|
+
selected_option = options[selection - 1]
|
54
|
+
puts "Selected #{selected_option[:type]}: #{selected_option[:name]}"
|
44
55
|
|
45
56
|
# Prompt user for number of documents to seed
|
46
57
|
puts "\nHow many documents would you like to seed?"
|
@@ -56,9 +67,7 @@ module SchemaTools
|
|
56
67
|
exit 1
|
57
68
|
end
|
58
69
|
|
59
|
-
|
60
|
-
|
61
|
-
# Call the seeding function
|
62
|
-
Seed.seed_data(num_docs, mappings, client, selected_index)
|
70
|
+
seeder = Seeder::Seeder.new(index_or_alias_name: selected_option[:name], client: client)
|
71
|
+
seeder.seed(num_docs: num_docs, batch_size: 5)
|
63
72
|
end
|
64
73
|
end
|
@@ -0,0 +1,15 @@
|
|
1
|
+
module SchemaTools::Seeder
  # Parent class for schema-specific document seeders.
  #
  # A schema opts in by providing schemas/{alias_name}/doc_seeder.rb with a
  # DocSeeder class that extends CustomDocSeeder and overrides
  # #generate_document.
  class CustomDocSeeder < BaseDocSeeder
    # index_or_alias_name - name of the index or alias being seeded.
    def initialize(index_or_alias_name)
      @index_or_alias_name = index_or_alias_name
    end

    attr_reader :index_or_alias_name

    # Must be overridden; this implementation always raises NotImplementedError.
    def generate_document
      raise(NotImplementedError, "Subclasses must implement #generate_document")
    end
  end
end
|
@@ -0,0 +1,451 @@
|
|
1
|
+
require 'securerandom'
|
2
|
+
|
3
|
+
module SchemaTools::Seeder
|
4
|
+
# Create a seed document by generating random values of correct types for an index mappings
|
5
|
+
class MappingsDocSeeder < BaseDocSeeder
|
6
|
+
|
7
|
+
# mappings: OpenSearch/Elasticsearch index mappings
|
8
|
+
# Store the index mappings that documents are generated from.
#
# mappings - OpenSearch/Elasticsearch index mappings Hash; generate_document
#            reads its 'properties' entry.
def initialize(mappings)
  @mappings = mappings
end
|
11
|
+
|
12
|
+
# Build one document with a generated value for every top-level mapped field.
#
# Fields whose generator returns nil (alias fields, for example) are left
# out. Returns a Hash of field name => value; empty when the mappings have
# no 'properties'.
def generate_document
  top_level_fields = @mappings.dig('properties') || {}

  top_level_fields.each_with_object({}) do |(field_name, field_config), document|
    generated = self.class.generate_field_value(field_config)
    document[field_name] = generated unless generated.nil?
  end
end
|
23
|
+
|
24
|
+
# Generate a random value for a single field of the index mappings.
#
# field_config - Hash for one mapped field. Only the 'type', 'properties'
#                and 'format' keys are read.
#
# A field that carries 'properties' but no 'type' is generated as an object,
# the same inference generate_object_value applies to its children. Without
# it such a field fell through to the keyword default and received a string.
#
# Returns the generated value, or nil for 'alias' fields (callers skip nil).
# Types not listed below fall back to a keyword value.
def self.generate_field_value(field_config)
  field_type = field_config['type'] || ('object' if field_config['properties'])

  case field_type
  when 'text' then generate_text_value
  when 'keyword' then generate_keyword_value
  when 'long', 'integer' then generate_integer_value
  when 'short' then generate_short_value
  when 'float', 'double' then generate_float_value
  when 'boolean' then generate_boolean_value
  when 'date' then generate_date_value(field_config['format'])
  when 'object' then generate_object_value(field_config['properties'])
  when 'nested' then generate_nested_value(field_config['properties'])
  when 'rank_features' then generate_rank_features_value
  when 'completion' then generate_completion_value
  when 'search_as_you_type' then generate_search_as_you_type_value
  when 'token_count' then generate_token_count_value
  when 'alias' then nil # alias fields point to other fields; nothing to generate
  when 'byte' then generate_byte_value
  when 'half_float' then generate_half_float_value
  when 'scaled_float' then generate_scaled_float_value
  when 'unsigned_long' then generate_unsigned_long_value
  when 'date_nanos' then generate_date_nanos_value
  when 'wildcard' then generate_wildcard_value
  when 'constant_keyword' then generate_constant_keyword_value
  when 'geo_shape' then generate_geo_shape_value
  when 'date_range' then generate_date_range_value
  when 'integer_range' then generate_integer_range_value
  when 'float_range' then generate_float_range_value
  when 'long_range' then generate_long_range_value
  when 'double_range' then generate_double_range_value
  when 'ip_range' then generate_ip_range_value
  when 'geo_point' then generate_geo_point_value
  when 'ip' then generate_ip_value
  when 'binary' then generate_binary_value
  else generate_keyword_value # unknown (or missing) type: default to keyword
  end
end
|
96
|
+
|
97
|
+
# Produce a paragraph of 10 to 50 randomly chosen dictionary words joined by
# single spaces.
def self.generate_text_value
  Array.new(rand(10..50)) { dictionary_words.sample }.join(' ')
end
|
102
|
+
|
103
|
+
# Produce a short keyword in one of four randomly chosen shapes:
# "word", "word_NNNN", "word word" or "word-word".
def self.generate_keyword_value
  shape = rand(1..4)
  lead_word = dictionary_words.sample
  return lead_word if shape == 1
  return "#{lead_word}_#{rand(1000..9999)}" if shape == 2

  separator = shape == 3 ? ' ' : '-'
  "#{lead_word}#{separator}#{dictionary_words.sample}"
end
|
116
|
+
|
117
|
+
# Produce an integer drawn from one of five randomly chosen ranges that
# resemble common use cases.
def self.generate_integer_value
  candidate_ranges = [
    1..1000,                    # small positive numbers
    1_000_000..999_999_999,     # large IDs
    -100..100,                  # small range including negatives
    1..100,                     # percentages/scores
    1..365                      # days/periods
  ]
  rand(candidate_ranges[rand(1..5) - 1])
end
|
132
|
+
|
133
|
+
# Produce a small integer, well inside the Java short range
# (-32,768 to 32,767), from one of three randomly chosen ranges.
def self.generate_short_value
  candidate_ranges = [
    1..100,     # small positive numbers (ratings, counts)
    -100..100,  # small range including negatives
    1..10       # very small numbers (ratings, flags)
  ]
  rand(candidate_ranges[rand(1..3) - 1])
end
|
144
|
+
|
145
|
+
# Produce a decimal number in one of three randomly chosen shapes:
# 0-100 (2 decimals), 0-1000 (4 decimals) or -5 to 5 (3 decimals).
def self.generate_float_value
  scale, shift, decimals = [
    [100, 0, 2],
    [1000, 0, 4],
    [10, -5, 3]
  ][rand(1..3) - 1]

  (rand * scale + shift).round(decimals)
end
|
156
|
+
|
157
|
+
# Pick true or false at random.
def self.generate_boolean_value
  outcomes = [true, false]
  outcomes.sample
end
|
160
|
+
|
161
|
+
# Produce a random date within the last year, rendered per the field's
# mapping format.
#
# format - the field's 'format' mapping parameter, or nil. It may list
#          several alternatives separated by "||"; the first alternative
#          this method knows how to render is used, so the value matches at
#          least one declared format.
#
# Returns an Integer for epoch formats, otherwise a String. Falls back to
# ISO 8601 when no alternative is recognised or no format is given.
def self.generate_date_value(format = nil)
  one_year = 365 * 24 * 60 * 60
  start_time = Time.now - one_year
  random_time = Time.at(start_time.to_i + rand(Time.now.to_i - start_time.to_i))

  renderers = {
    'epoch_millis' => -> { (random_time.to_f * 1000).to_i },
    'epoch_second' => -> { random_time.to_i },
    'yyyy-MM-dd' => -> { random_time.strftime('%Y-%m-%d') },
    'yyyy-MM-dd HH:mm:ss' => -> { random_time.strftime('%Y-%m-%d %H:%M:%S') },
    'MM/dd/yyyy' => -> { random_time.strftime('%m/%d/%Y') },
    'dd-MM-yyyy' => -> { random_time.strftime('%d-%m-%Y') },
    # ISO-style formats are listed so that they win when declared first.
    'strict_date_optional_time' => -> { random_time.iso8601 },
    'date_optional_time' => -> { random_time.iso8601 }
  }

  alternatives = format.to_s.split('||').map(&:strip)
  chosen = alternatives.find { |candidate| renderers.key?(candidate) }

  # Default to ISO 8601 format
  chosen ? renderers[chosen].call : random_time.iso8601
end
|
184
|
+
|
185
|
+
# Generate a Hash value for an object field.
#
# properties - Hash of child field name => child field config, or nil.
#
# Each child is generated through generate_field_value. A child with
# 'properties' but no 'type' is treated as an object; a child with neither
# is treated as a keyword. Children whose generator returns nil (alias
# fields) are left out, matching what generate_document does at top level.
#
# Returns {} when properties is nil.
def self.generate_object_value(properties)
  return {} unless properties

  properties.each_with_object({}) do |(child_name, child_config), object|
    # If a field has properties but no explicit type, it's an object
    child_type = child_config['type'] || (child_config['properties'] ? 'object' : 'keyword')

    value = generate_field_value(
      'type' => child_type,
      'properties' => child_config['properties'],
      'format' => child_config['format']
    )
    object[child_name] = value unless value.nil?
  end
end
|
202
|
+
|
203
|
+
# Generate an Array of 1 to 3 objects for a nested field.
#
# properties - Hash of child field name => child field config, or nil.
#
# The per-child config handed to generate_field_value uses STRING keys,
# because that is what generate_field_value reads. Symbol keys made every
# child look untyped, so it was generated as a keyword and its 'format' and
# 'properties' were ignored. Children whose generator returns nil (alias
# fields) are left out.
#
# Returns [] when properties is nil.
def self.generate_nested_value(properties)
  return [] unless properties

  # Generate 1-3 nested objects
  Array.new(rand(1..3)) do
    properties.each_with_object({}) do |(child_name, child_config), object|
      # If a field has properties but no explicit type, it's an object
      child_type = child_config['type'] || (child_config['properties'] ? 'object' : 'keyword')

      value = generate_field_value(
        'type' => child_type,
        'properties' => child_config['properties'],
        'format' => child_config['format']
      )
      object[child_name] = value unless value.nil?
    end
  end
end
|
224
|
+
|
225
|
+
# Produce a rank_features Hash of feature name => positive score.
#
# Between 3 and 8 entries are generated (fewer keys result if two generated
# names collide). Scores are rounded to 4 decimals and floored at 1.0e-30 so
# that rounding can never yield 0.0; the original author notes OpenSearch
# requires positive normal floats (minimum 1.17549435E-38).
def self.generate_rank_features_value
  floor = 1.0e-30 # much higher than the OpenSearch minimum

  rand(3..8).times.each_with_object({}) do |_, features|
    name = "#{dictionary_words.sample}_#{rand(100..999)}"
    score = rand(floor..1.0).round(4)
    features[name] = score < floor ? floor : score
  end
end
|
244
|
+
|
245
|
+
# Produce a random coordinate as { lat:, lon: }, each rounded to 6 decimals.
def self.generate_geo_point_value
  latitude = (rand * 180 - 90).round(6)    # -90 to 90
  longitude = (rand * 360 - 180).round(6)  # -180 to 180
  { lat: latitude, lon: longitude }
end
|
252
|
+
|
253
|
+
# Produce a random IP address string: an IPv4 dotted quad, or a simplified
# IPv6 address under the 2001:db8:: prefix, chosen with equal probability.
def self.generate_ip_value
  if rand(1..2) == 1
    # IPv4
    [rand(1..254), rand(0..255), rand(0..255), rand(1..254)].join('.')
  else
    # IPv6 (simplified): four 4-digit decimal groups after the prefix
    groups = Array.new(4) { rand(1000..9999) }
    "2001:db8::#{groups.join(':')}"
  end
end
|
264
|
+
|
265
|
+
# Produce 32 random bytes encoded as a single-line Base64 string.
#
# Array#pack('m0') is used instead of the base64 library: it needs no
# require inside the method and yields the same text that
# Base64.encode64(...).strip produced for a 32-byte input.
def self.generate_binary_value
  random_bytes = Array.new(32) { rand(256) }.pack('C*')
  [random_bytes].pack('m0')
end
|
271
|
+
|
272
|
+
# Produce a completion suggestion: two inputs (a single word and a two-word
# phrase) plus a weight from 1 to 100.
def self.generate_completion_value
  single_word = dictionary_words.sample
  phrase = "#{dictionary_words.sample} #{dictionary_words.sample}"
  weight = rand(1..100)

  { 'input' => [single_word, phrase], 'weight' => weight }
end
|
279
|
+
|
280
|
+
# Produce search-as-you-type text: three random dictionary words separated
# by single spaces.
def self.generate_search_as_you_type_value
  Array.new(3) { dictionary_words.sample }.join(' ')
end
|
284
|
+
|
285
|
+
# Produce a token count: an integer from 1 to 50.
def self.generate_token_count_value
  token_count_range = 1..50
  rand(token_count_range)
end
|
289
|
+
|
290
|
+
# Produce a byte value: an integer from -128 to 127.
def self.generate_byte_value
  byte_range = -128..127
  rand(byte_range)
end
|
294
|
+
|
295
|
+
# Produce a half_float value between -50 and 50, rounded to 2 decimals
# (a deliberately smaller range than the regular float generator).
def self.generate_half_float_value
  unrounded = rand * 100 - 50
  unrounded.round(2)
end
|
299
|
+
|
300
|
+
# Produce a scaled_float value between 0 and 100, rounded to 2 decimals.
# The field's scaling factor is not consulted here.
def self.generate_scaled_float_value
  unrounded = rand * 100
  unrounded.round(2)
end
|
304
|
+
|
305
|
+
# Produce a non-negative integer for an unsigned_long field. The type allows
# values up to 2^64-1, but results are kept to at most 999,999,999.
def self.generate_unsigned_long_value
  reasonable_range = 0..999_999_999
  rand(reasonable_range)
end
|
309
|
+
|
310
|
+
# Produce a random timestamp from the last year as an ISO 8601 string with
# nine fractional-second digits.
def self.generate_date_nanos_value
  one_year = 365 * 24 * 60 * 60
  earliest = (Time.now - one_year).to_i
  offset = rand(Time.now.to_i - earliest)

  Time.at(earliest + offset).iso8601(9) # include nanoseconds
end
|
316
|
+
|
317
|
+
# Produce a value for a wildcard field: a dictionary word, an underscore and
# a 4-digit number.
def self.generate_wildcard_value
  [dictionary_words.sample, rand(1000..9999)].join('_')
end
|
321
|
+
|
322
|
+
# Produce the value for a constant_keyword field: always the same string.
def self.generate_constant_keyword_value
  'constant_value'
end
|
326
|
+
|
327
|
+
# Produce a simple geo shape: a point with [longitude, latitude]
# coordinates, each rounded to 6 decimals.
def self.generate_geo_shape_value
  longitude = rand(-180.0..180.0).round(6)
  latitude = rand(-90.0..90.0).round(6)

  { 'type' => 'point', 'coordinates' => [longitude, latitude] }
end
|
334
|
+
|
335
|
+
# Produce a date range covering the last year: 'gte' is one year before now
# and 'lte' is now, both as ISO 8601 strings.
def self.generate_date_range_value
  one_year = 365 * 24 * 60 * 60
  lower_bound = (Time.now - one_year).iso8601
  upper_bound = Time.now.iso8601

  { 'gte' => lower_bound, 'lte' => upper_bound }
end
|
344
|
+
|
345
|
+
# Produce an integer range: 'gte' is between -1000 and 1000 and 'lte' is
# 1 to 1000 above it.
def self.generate_integer_range_value
  lower = rand(-1000..1000)
  { 'gte' => lower, 'lte' => lower + rand(1..1000) }
end
|
354
|
+
|
355
|
+
# Produce a float range: 'gte' is between -50 and 50 (2 decimals) and 'lte'
# is that value plus a width between 0 and 100 (2 decimals).
def self.generate_float_range_value
  lower = (rand * 100 - 50).round(2)
  width = (rand * 100).round(2)

  { 'gte' => lower, 'lte' => lower + width }
end
|
364
|
+
|
365
|
+
# Produce a long range: 'gte' is between -1,000,000 and 1,000,000 and 'lte'
# is 1 to 1,000,000 above it.
def self.generate_long_range_value
  lower = rand(-1_000_000..1_000_000)
  { 'gte' => lower, 'lte' => lower + rand(1..1_000_000) }
end
|
374
|
+
|
375
|
+
# Produce a double range: 'gte' is between -500 and 500 (4 decimals) and
# 'lte' is that value plus a width between 0 and 1000 (4 decimals).
def self.generate_double_range_value
  lower = (rand * 1000 - 500).round(4)
  width = (rand * 1000).round(4)

  { 'gte' => lower, 'lte' => lower + width }
end
|
384
|
+
|
385
|
+
# Produce an ordered IPv4 range inside a single /24: both ends share the
# first three octets, and the upper end's last octet is 1 to 10 above the
# lower end's, capped at 254.
def self.generate_ip_range_value
  first, second, third, low_octet = rand(1..254), rand(0..255), rand(0..255), rand(1..254)
  high_octet = [low_octet + rand(1..10), 254].min
  network = "#{first}.#{second}.#{third}"

  { 'gte' => "#{network}.#{low_octet}", 'lte' => "#{network}.#{high_octet}" }
end
|
404
|
+
|
405
|
+
# Word list used by the text-like generators, computed once and memoized.
#
# Reads /usr/share/dict/words and keeps the entries that are 3 to 10
# characters long. When that file does not exist, a built-in list of
# lorem-ipsum and search/infrastructure terms is used instead (that list is
# not length-filtered).
def self.dictionary_words
  return @dictionary_words if @dictionary_words

  @dictionary_words =
    begin
      File.readlines('/usr/share/dict/words')
          .map(&:chomp)
          .select { |word| (3..10).cover?(word.length) }
    rescue Errno::ENOENT
      <<~'WORDS'.split
        lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor
        incididunt ut labore et dolore magna aliqua enim ad minim veniam quis nostrud
        exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat duis aute
        irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat
        nulla pariatur excepteur sint occaecat cupidatat non proident sunt in culpa
        qui officia deserunt mollit anim id est laborum search engine data ruby
        document index mapping schema elasticsearch opensearch cluster node shard
        replica primary secondary analysis tokenizer filter analyzer query filter
        aggregation pipeline script painless groovy mustache template kibana
        logstash beats metricbeat filebeat packetbeat heartbeat auditbeat
        functionbeat winlogbeat journalbeat apm agent apm server fleet agent
        policy enrollment token integration package endpoint security detection
        rule machine learning anomaly detection forecasting classification
        regression clustering outlier detection natural language processing
        vector search semantic search neural search transformer embedding
        vector database similarity search recommendation system personalization
        real-time streaming batch processing event sourcing cqrs microservices
        distributed system scalability performance optimization monitoring
        observability logging metrics tracing alerting notification dashboard
        visualization reporting analytics business intelligence data science
        machine learning artificial intelligence deep learning neural network
        algorithm model training inference prediction classification regression
        clustering dimensionality reduction feature engineering data preprocessing
        validation testing deployment production staging development environment
        configuration management version control continuous integration continuous
        deployment devops infrastructure as code containerization orchestration
        kubernetes docker swarm mesos nomad consul etcd zookeeper redis memcached
        rabbitmq kafka pulsar nats jetstream grpc rest api graphql websocket
        http https tls ssl certificate authentication authorization oauth jwt
        saml ldap active directory kerberos rbac abac policy enforcement
        compliance governance security audit vulnerability assessment penetration
        testing threat modeling risk management incident response disaster recovery
        backup restore high availability fault tolerance load balancing auto-scaling
        horizontal scaling vertical scaling sharding partitioning replication
        consistency eventual consistency strong consistency cap theorem acid
        base distributed consensus raft paxos byzantine fault tolerance
      WORDS
    end
end
|
450
|
+
end
|
451
|
+
end
|
@@ -0,0 +1,20 @@
|
|
1
|
+
require 'securerandom'
|
2
|
+
require 'active_support/all'
|
3
|
+
|
4
|
+
module SchemaTools::Seeder
  # Generate a document by choosing one at random from a set of sample documents.
  #
  # The seeder looks for sample docs in schemas/{alias_name}/sample_docs.json
  # in the form: { "hits": [ { "_source": { "title": "Foo", "desc": "Bar" } }, ... ] }
  #
  # A raw search response, where the array sits under "hits" => "hits", is
  # accepted as well.
  class SampleDocSeeder < BaseDocSeeder

    # sample_docs: parsed sample_docs.json Hash whose "hits" entry holds the
    # sample documents to pull from at random.
    #
    # Raises ArgumentError when no "hits" array can be found.
    def initialize(sample_docs)
      hits = sample_docs['hits']
      # Unwrap { "hits" => { "hits" => [...] } } as found in search responses.
      hits = hits['hits'] if hits.is_a?(Hash)

      unless hits.is_a?(Array)
        raise ArgumentError, 'sample docs must contain a "hits" array of { "_source": ... } entries'
      end

      # Core Array#map is enough here; ActiveSupport's pluck is not needed.
      @sample_docs = hits.map { |hit| hit['_source'] }
    end

    # Returns one sample document's _source chosen at random (nil when the
    # "hits" array was empty).
    def generate_document
      @sample_docs.sample
    end
  end
end
|
data/lib/seeder/seeder.rb
CHANGED
@@ -1,539 +1,117 @@
|
|
1
1
|
require 'json'
|
2
2
|
require 'time'
|
3
3
|
|
4
|
-
module
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat
|
11
|
-
nulla pariatur excepteur sint occaecat cupidatat non proident sunt in culpa
|
12
|
-
qui officia deserunt mollit anim id est laborum search engine data ruby
|
13
|
-
document index mapping schema elasticsearch opensearch cluster node shard
|
14
|
-
replica primary secondary analysis tokenizer filter analyzer query filter
|
15
|
-
aggregation pipeline script painless groovy mustache template kibana
|
16
|
-
logstash beats metricbeat filebeat packetbeat heartbeat auditbeat
|
17
|
-
functionbeat winlogbeat journalbeat apm agent apm server fleet agent
|
18
|
-
policy enrollment token integration package endpoint security detection
|
19
|
-
rule machine learning anomaly detection forecasting classification
|
20
|
-
regression clustering outlier detection natural language processing
|
21
|
-
vector search semantic search neural search transformer embedding
|
22
|
-
vector database similarity search recommendation system personalization
|
23
|
-
real-time streaming batch processing event sourcing cqrs microservices
|
24
|
-
distributed system scalability performance optimization monitoring
|
25
|
-
observability logging metrics tracing alerting notification dashboard
|
26
|
-
visualization reporting analytics business intelligence data science
|
27
|
-
machine learning artificial intelligence deep learning neural network
|
28
|
-
algorithm model training inference prediction classification regression
|
29
|
-
clustering dimensionality reduction feature engineering data preprocessing
|
30
|
-
validation testing deployment production staging development environment
|
31
|
-
configuration management version control continuous integration continuous
|
32
|
-
deployment devops infrastructure as code containerization orchestration
|
33
|
-
kubernetes docker swarm mesos nomad consul etcd zookeeper redis memcached
|
34
|
-
rabbitmq kafka pulsar nats jetstream grpc rest api graphql websocket
|
35
|
-
http https tls ssl certificate authentication authorization oauth jwt
|
36
|
-
saml ldap active directory kerberos rbac abac policy enforcement
|
37
|
-
compliance governance security audit vulnerability assessment penetration
|
38
|
-
testing threat modeling risk management incident response disaster recovery
|
39
|
-
backup restore high availability fault tolerance load balancing auto-scaling
|
40
|
-
horizontal scaling vertical scaling sharding partitioning replication
|
41
|
-
consistency eventual consistency strong consistency cap theorem acid
|
42
|
-
base distributed consensus raft paxos byzantine fault tolerance
|
43
|
-
].freeze
|
44
|
-
|
45
|
-
def self.seed_data(num_docs, mappings_json, client, index_name)
|
46
|
-
puts "Seeding #{num_docs} documents to index: #{index_name}"
|
47
|
-
|
48
|
-
# Parse the mappings to understand the schema
|
49
|
-
schema = parse_mappings(mappings_json)
|
50
|
-
puts "Parsed schema with #{schema.keys.length} top-level fields"
|
51
|
-
|
52
|
-
# Generate documents in batches for efficiency
|
53
|
-
# Reduced batch size to avoid circuit breaker issues with large documents
|
54
|
-
batch_size = 25 # Reduced from 100 to 25 for large documents
|
55
|
-
total_batches = (num_docs.to_f / batch_size).ceil
|
56
|
-
|
57
|
-
(1..total_batches).each do |batch_num|
|
58
|
-
docs_in_batch = [batch_size, num_docs - (batch_num - 1) * batch_size].min
|
59
|
-
puts "Generating batch #{batch_num}/#{total_batches} (#{docs_in_batch} documents)..."
|
60
|
-
|
61
|
-
documents = generate_document_batch(docs_in_batch, schema)
|
62
|
-
|
63
|
-
puts "Indexing batch #{batch_num}..."
|
64
|
-
begin
|
65
|
-
response = client.bulk_index(documents, index_name)
|
66
|
-
|
67
|
-
# Check for errors in bulk response
|
68
|
-
if response['errors']
|
69
|
-
error_items = response['items'].select { |item| item.dig('index', 'status') >= 400 }
|
70
|
-
error_count = error_items.length
|
71
|
-
if error_count > 0
|
72
|
-
puts "WARN: #{error_count} documents failed to index in batch #{batch_num}"
|
73
|
-
|
74
|
-
# Print first few errors for debugging
|
75
|
-
error_items.first(3).each_with_index do |item, index|
|
76
|
-
error_info = item.dig('index', 'error')
|
77
|
-
if error_info
|
78
|
-
puts " Error #{index + 1}: #{error_info['type']} - #{error_info['reason']}"
|
79
|
-
if error_info['caused_by']
|
80
|
-
puts " Caused by: #{error_info['caused_by']['type']} - #{error_info['caused_by']['reason']}"
|
81
|
-
end
|
82
|
-
end
|
83
|
-
end
|
84
|
-
|
85
|
-
if error_count > 3
|
86
|
-
puts " ... and #{error_count - 3} more errors"
|
87
|
-
end
|
88
|
-
end
|
89
|
-
end
|
90
|
-
|
91
|
-
puts "Successfully indexed batch #{batch_num}"
|
92
|
-
rescue => e
|
93
|
-
if e.message.include?('circuit_breaking_exception') || e.message.include?('HTTP 429')
|
94
|
-
puts "ERROR: Circuit breaker triggered - OpenSearch cluster is out of memory"
|
95
|
-
puts "Consider:"
|
96
|
-
puts " 1. Reducing batch size further (currently #{batch_size})"
|
97
|
-
puts " 2. Increasing OpenSearch heap size"
|
98
|
-
puts " 3. Reducing document size/complexity"
|
99
|
-
puts " 4. Adding delays between batches"
|
100
|
-
puts ""
|
101
|
-
puts "Batch #{batch_num} failed: #{e.message}"
|
102
|
-
raise StandardError.new("Circuit breaker triggered - OpenSearch cluster is out of memory")
|
103
|
-
else
|
104
|
-
puts "Error indexing batch #{batch_num}: #{e.message}"
|
105
|
-
raise e
|
106
|
-
end
|
107
|
-
end
|
108
|
-
|
109
|
-
# Add a small delay between batches to help with memory pressure
|
110
|
-
sleep(0.1) if batch_num < total_batches
|
4
|
+
module SchemaTools::Seeder
|
5
|
+
class Seeder
|
6
|
+
def initialize(index_or_alias_name:, client:)
|
7
|
+
@client = client
|
8
|
+
@index_or_alias_name = index_or_alias_name
|
9
|
+
@doc_seeder = initialize_doc_seeder
|
111
10
|
end
|
112
|
-
|
113
|
-
puts "Successfully seeded #{num_docs} documents to #{index_name}"
|
114
|
-
end
|
115
11
|
|
116
|
-
|
12
|
+
def initialize_doc_seeder
|
13
|
+
custom_doc_seeder_class = SchemaTools::SchemaFiles.get_doc_seeder_class(@index_or_alias_name)
|
14
|
+
return custom_doc_seeder_class.new(@index_or_alias_name) if custom_doc_seeder_class
|
117
15
|
|
118
|
-
|
119
|
-
|
120
|
-
properties = mappings_json.dig('properties') || {}
|
121
|
-
parse_properties(properties)
|
122
|
-
end
|
16
|
+
sample_docs = SchemaTools::SchemaFiles.get_sample_docs(@index_or_alias_name)
|
17
|
+
return SampleDocSeeder.new(sample_docs) if sample_docs
|
123
18
|
|
124
|
-
|
125
|
-
|
126
|
-
|
127
|
-
|
128
|
-
# If a field has properties but no explicit type, it's an object
|
129
|
-
field_type = field_config['type'] || (field_config['properties'] ? 'object' : 'keyword')
|
130
|
-
|
131
|
-
schema[field_name] = {
|
132
|
-
type: field_type,
|
133
|
-
properties: field_config['properties'],
|
134
|
-
format: field_config['format']
|
135
|
-
}
|
136
|
-
end
|
137
|
-
|
138
|
-
schema
|
139
|
-
end
|
19
|
+
# Resolve alias to actual index name if needed
|
20
|
+
actual_index_name = resolve_to_index_name(@index_or_alias_name)
|
21
|
+
mappings = @client.get_index_mappings(actual_index_name)
|
22
|
+
return MappingsDocSeeder.new(mappings) if mappings
|
140
23
|
|
141
|
-
|
142
|
-
count.times.map do
|
143
|
-
generate_document(schema)
|
24
|
+
raise "No custom document seeder, sample documents, or mappings found for #{@index_or_alias_name}"
|
144
25
|
end
|
145
|
-
end
|
146
26
|
|
147
|
-
|
148
|
-
|
149
|
-
|
150
|
-
schema.each do |field_name, field_config|
|
151
|
-
value = generate_field_value(field_config)
|
152
|
-
# Skip fields that return nil (like alias fields)
|
153
|
-
document[field_name] = value unless value.nil?
|
154
|
-
end
|
27
|
+
def seed(num_docs:, batch_size: 5)
|
28
|
+
puts "Seeding #{num_docs} in batches of #{batch_size} documents from #{@index_or_alias_name} using #{@doc_seeder.class.name}"
|
155
29
|
|
156
|
-
|
157
|
-
|
30
|
+
total_batches = (num_docs.to_f / batch_size).ceil
|
31
|
+
total_seeded_docs = 0
|
158
32
|
|
159
|
-
|
160
|
-
|
161
|
-
|
162
|
-
case field_type
|
163
|
-
when 'text'
|
164
|
-
generate_text_value
|
165
|
-
when 'keyword'
|
166
|
-
generate_keyword_value
|
167
|
-
when 'long', 'integer'
|
168
|
-
generate_integer_value
|
169
|
-
when 'short'
|
170
|
-
generate_short_value
|
171
|
-
when 'float', 'double'
|
172
|
-
generate_float_value
|
173
|
-
when 'boolean'
|
174
|
-
generate_boolean_value
|
175
|
-
when 'date'
|
176
|
-
generate_date_value(field_config[:format])
|
177
|
-
when 'object'
|
178
|
-
generate_object_value(field_config[:properties])
|
179
|
-
when 'nested'
|
180
|
-
generate_nested_value(field_config[:properties])
|
181
|
-
when 'rank_features'
|
182
|
-
generate_rank_features_value
|
183
|
-
when 'completion'
|
184
|
-
generate_completion_value
|
185
|
-
when 'search_as_you_type'
|
186
|
-
generate_search_as_you_type_value
|
187
|
-
when 'token_count'
|
188
|
-
generate_token_count_value
|
189
|
-
when 'alias'
|
190
|
-
# Skip alias fields - they point to other fields
|
191
|
-
nil
|
192
|
-
when 'byte'
|
193
|
-
generate_byte_value
|
194
|
-
when 'half_float'
|
195
|
-
generate_half_float_value
|
196
|
-
when 'scaled_float'
|
197
|
-
generate_scaled_float_value
|
198
|
-
when 'unsigned_long'
|
199
|
-
generate_unsigned_long_value
|
200
|
-
when 'date_nanos'
|
201
|
-
generate_date_nanos_value
|
202
|
-
when 'wildcard'
|
203
|
-
generate_wildcard_value
|
204
|
-
when 'constant_keyword'
|
205
|
-
generate_constant_keyword_value
|
206
|
-
when 'geo_shape'
|
207
|
-
generate_geo_shape_value
|
208
|
-
when 'date_range'
|
209
|
-
generate_date_range_value
|
210
|
-
when 'integer_range'
|
211
|
-
generate_integer_range_value
|
212
|
-
when 'float_range'
|
213
|
-
generate_float_range_value
|
214
|
-
when 'long_range'
|
215
|
-
generate_long_range_value
|
216
|
-
when 'double_range'
|
217
|
-
generate_double_range_value
|
218
|
-
when 'ip_range'
|
219
|
-
generate_ip_range_value
|
220
|
-
when 'geo_point'
|
221
|
-
generate_geo_point_value
|
222
|
-
when 'ip'
|
223
|
-
generate_ip_value
|
224
|
-
when 'binary'
|
225
|
-
generate_binary_value
|
226
|
-
else
|
227
|
-
# Default to keyword for unknown types
|
228
|
-
generate_keyword_value
|
229
|
-
end
|
230
|
-
end
|
33
|
+
num_docs.times.each_slice(batch_size).with_index(1) do |batch_range, batch_num|
|
34
|
+
docs_in_batch = batch_range.size
|
231
35
|
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
end
|
36
|
+
puts "Generating batch #{batch_num}/#{total_batches} (#{docs_in_batch} documents)..."
|
37
|
+
documents = Array.new(docs_in_batch) do
|
38
|
+
@doc_seeder.generate_document
|
39
|
+
end
|
237
40
|
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
41
|
+
puts "Indexing batch #{batch_num}..."
|
42
|
+
response = bulk_index(documents)
|
43
|
+
seeded_docs = documents.length - print_errors(response)
|
44
|
+
total_seeded_docs += seeded_docs
|
45
|
+
puts "Indexed #{seeded_docs} documents for batch #{batch_num}" if seeded_docs
|
46
|
+
|
47
|
+
sleep(0.1) if batch_num < total_batches # small delay to help with memory pressure
|
48
|
+
rescue StandardError => e
|
49
|
+
puts "Batch #{batch_num} failed: #{e.message}"
|
50
|
+
handle_circuit_breaker_exception(e, batch_size)
|
51
|
+
raise e
|
52
|
+
end
|
53
|
+
puts "Seeded #{total_seeded_docs} documents to #{@index_or_alias_name}"
|
249
54
|
end
|
250
|
-
end
|
251
55
|
|
252
|
-
|
253
|
-
|
254
|
-
case rand(1..5)
|
255
|
-
when 1
|
256
|
-
rand(1..1000) # Small positive numbers
|
257
|
-
when 2
|
258
|
-
rand(1_000_000..999_999_999) # Large IDs
|
259
|
-
when 3
|
260
|
-
rand(-100..100) # Small range including negatives
|
261
|
-
when 4
|
262
|
-
rand(1..100) # Percentages/scores
|
263
|
-
when 5
|
264
|
-
rand(1..365) # Days/periods
|
56
|
+
def bulk_index(documents)
|
57
|
+
@client.bulk_index(documents, @index_or_alias_name)
|
265
58
|
end
|
266
|
-
end
|
267
59
|
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
275
|
-
|
276
|
-
|
60
|
+
def handle_circuit_breaker_exception(error, batch_size)
|
61
|
+
return unless error&.message&.match?(/circuit_breaking_exception|HTTP 429/)
|
62
|
+
|
63
|
+
puts 'ERROR: Circuit breaker triggered - OpenSearch cluster is out of memory'
|
64
|
+
puts 'Consider:'
|
65
|
+
puts " 1. Reducing batch size further (currently #{batch_size})"
|
66
|
+
puts ' 2. Increasing OpenSearch heap size'
|
67
|
+
puts ' 3. Reducing document size/complexity'
|
68
|
+
puts ' 4. Adding delays between batches'
|
69
|
+
puts ''
|
70
|
+
raise StandardError, 'Circuit breaker triggered - OpenSearch cluster is out of memory'
|
277
71
|
end
|
278
|
-
end
|
279
72
|
|
280
|
-
|
281
|
-
|
282
|
-
case rand(1..3)
|
283
|
-
when 1
|
284
|
-
(rand * 100).round(2) # 0-100 with 2 decimal places
|
285
|
-
when 2
|
286
|
-
(rand * 1000).round(4) # 0-1000 with 4 decimal places
|
287
|
-
when 3
|
288
|
-
(rand * 10 - 5).round(3) # -5 to 5 with 3 decimal places
|
289
|
-
end
|
290
|
-
end
|
73
|
+
def print_errors(response)
|
74
|
+
return 0 unless response['errors']
|
291
75
|
|
292
|
-
|
293
|
-
|
294
|
-
|
76
|
+
error_items = response['items'].select { |item| item.dig('index', 'status') >= 400 }
|
77
|
+
error_count = error_items.length
|
78
|
+
return 0 unless error_count.positive?
|
295
79
|
|
296
|
-
|
297
|
-
# Generate a random date within the last year
|
298
|
-
start_time = Time.now - (365 * 24 * 60 * 60) # one year ago
|
299
|
-
random_time = Time.at(start_time.to_i + rand(Time.now.to_i - start_time.to_i))
|
300
|
-
|
301
|
-
case format
|
302
|
-
when 'epoch_millis'
|
303
|
-
(random_time.to_f * 1000).to_i
|
304
|
-
when 'epoch_second'
|
305
|
-
random_time.to_i
|
306
|
-
when 'yyyy-MM-dd'
|
307
|
-
random_time.strftime('%Y-%m-%d')
|
308
|
-
when 'yyyy-MM-dd HH:mm:ss'
|
309
|
-
random_time.strftime('%Y-%m-%d %H:%M:%S')
|
310
|
-
when 'MM/dd/yyyy'
|
311
|
-
random_time.strftime('%m/%d/%Y')
|
312
|
-
when 'dd-MM-yyyy'
|
313
|
-
random_time.strftime('%d-%m-%Y')
|
314
|
-
else
|
315
|
-
# Default to ISO 8601 format
|
316
|
-
random_time.iso8601
|
317
|
-
end
|
318
|
-
end
|
80
|
+
puts "WARN: #{error_count} documents failed to index"
|
319
81
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
properties.each do |nested_field_name, nested_field_config|
|
325
|
-
# If a field has properties but no explicit type, it's an object
|
326
|
-
field_type = nested_field_config['type'] || (nested_field_config['properties'] ? 'object' : 'keyword')
|
327
|
-
|
328
|
-
parsed_config = {
|
329
|
-
type: field_type,
|
330
|
-
properties: nested_field_config['properties'],
|
331
|
-
format: nested_field_config['format']
|
332
|
-
}
|
333
|
-
object[nested_field_name] = generate_field_value(parsed_config)
|
334
|
-
end
|
335
|
-
object
|
336
|
-
end
|
82
|
+
# Print first few errors for debugging
|
83
|
+
error_items.first(3).each_with_index do |item, index|
|
84
|
+
error_info = item.dig('index', 'error')
|
85
|
+
next unless error_info
|
337
86
|
|
338
|
-
|
339
|
-
return [] unless properties
|
340
|
-
|
341
|
-
# Generate 1-3 nested objects
|
342
|
-
count = rand(1..3)
|
343
|
-
count.times.map do
|
344
|
-
object = {}
|
345
|
-
properties.each do |nested_field_name, nested_field_config|
|
346
|
-
# If a field has properties but no explicit type, it's an object
|
347
|
-
field_type = nested_field_config['type'] || (nested_field_config['properties'] ? 'object' : 'keyword')
|
348
|
-
|
349
|
-
parsed_config = {
|
350
|
-
type: field_type,
|
351
|
-
properties: nested_field_config['properties'],
|
352
|
-
format: nested_field_config['format']
|
353
|
-
}
|
354
|
-
object[nested_field_name] = generate_field_value(parsed_config)
|
87
|
+
print_error_item(error_info, index)
|
355
88
|
end
|
356
|
-
object
|
357
|
-
end
|
358
|
-
end
|
359
89
|
|
360
|
-
|
361
|
-
|
362
|
-
# OpenSearch requires positive normal floats with minimum value of 1.17549435E-38
|
363
|
-
feature_count = rand(3..8)
|
364
|
-
features = {}
|
365
|
-
|
366
|
-
feature_count.times do
|
367
|
-
feature_name = "#{WORD_LIST.sample}_#{rand(100..999)}"
|
368
|
-
# Generate values between 1.0e-30 and 1.0 to ensure positive normal floats
|
369
|
-
# Use a higher minimum to avoid floating-point precision issues
|
370
|
-
min_value = 1.0e-30 # Much higher than the OpenSearch minimum
|
371
|
-
value = rand(min_value..1.0).round(4)
|
372
|
-
# Ensure we never get exactly 0.0 due to floating-point precision
|
373
|
-
value = [value, 1.0e-30].max
|
374
|
-
features[feature_name] = value
|
90
|
+
puts " ... and #{error_count - 3} more errors" if error_count > 3
|
91
|
+
error_count
|
375
92
|
end
|
376
|
-
|
377
|
-
features
|
378
|
-
end
|
379
93
|
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
lat: (rand * 180 - 90).round(6), # -90 to 90
|
384
|
-
lon: (rand * 360 - 180).round(6) # -180 to 180
|
385
|
-
}
|
386
|
-
end
|
94
|
+
def print_error_item(error_info, index)
|
95
|
+
puts " Error #{index + 1}: #{error_info['type']} - #{error_info['reason']}"
|
96
|
+
return unless error_info['caused_by']
|
387
97
|
|
388
|
-
|
389
|
-
# Generate random IP addresses
|
390
|
-
case rand(1..2)
|
391
|
-
when 1
|
392
|
-
# IPv4
|
393
|
-
"#{rand(1..254)}.#{rand(0..255)}.#{rand(0..255)}.#{rand(1..254)}"
|
394
|
-
when 2
|
395
|
-
# IPv6 (simplified)
|
396
|
-
"2001:db8::#{rand(1000..9999)}:#{rand(1000..9999)}:#{rand(1000..9999)}:#{rand(1000..9999)}"
|
98
|
+
puts " Caused by: #{error_info['caused_by']['type']} - #{error_info['caused_by']['reason']}"
|
397
99
|
end
|
398
|
-
end
|
399
|
-
|
400
|
-
def self.generate_binary_value
|
401
|
-
# Generate base64 encoded random data
|
402
|
-
require 'base64'
|
403
|
-
random_bytes = (0...32).map { rand(256) }.pack('C*')
|
404
|
-
Base64.encode64(random_bytes).strip
|
405
|
-
end
|
406
|
-
|
407
|
-
def self.generate_completion_value
|
408
|
-
# Generate completion suggestions
|
409
|
-
{
|
410
|
-
'input' => [WORD_LIST.sample, "#{WORD_LIST.sample} #{WORD_LIST.sample}"],
|
411
|
-
'weight' => rand(1..100)
|
412
|
-
}
|
413
|
-
end
|
414
|
-
|
415
|
-
def self.generate_search_as_you_type_value
|
416
|
-
# Generate search-as-you-type text
|
417
|
-
"#{WORD_LIST.sample} #{WORD_LIST.sample} #{WORD_LIST.sample}"
|
418
|
-
end
|
419
|
-
|
420
|
-
def self.generate_token_count_value
|
421
|
-
# Generate token count (integer representing number of tokens)
|
422
|
-
rand(1..50)
|
423
|
-
end
|
424
|
-
|
425
|
-
def self.generate_byte_value
|
426
|
-
# Generate byte values (-128 to 127)
|
427
|
-
rand(-128..127)
|
428
|
-
end
|
429
|
-
|
430
|
-
def self.generate_half_float_value
|
431
|
-
# Generate half-float values (smaller range than regular float)
|
432
|
-
(rand * 100 - 50).round(2)
|
433
|
-
end
|
434
|
-
|
435
|
-
def self.generate_scaled_float_value
|
436
|
-
# Generate scaled float values (multiplied by scaling factor)
|
437
|
-
(rand * 100).round(2)
|
438
|
-
end
|
439
|
-
|
440
|
-
def self.generate_unsigned_long_value
|
441
|
-
# Generate unsigned long values (0 to 2^64-1, but keep reasonable)
|
442
|
-
rand(0..999_999_999)
|
443
|
-
end
|
444
|
-
|
445
|
-
def self.generate_date_nanos_value
|
446
|
-
# Generate date with nanosecond precision
|
447
|
-
start_time = Time.now - (365 * 24 * 60 * 60)
|
448
|
-
random_time = Time.at(start_time.to_i + rand(Time.now.to_i - start_time.to_i))
|
449
|
-
random_time.iso8601(9) # Include nanoseconds
|
450
|
-
end
|
451
100
|
|
452
|
-
|
453
|
-
# Generate wildcard text (similar to keyword but optimized for wildcard queries)
|
454
|
-
"#{WORD_LIST.sample}_#{rand(1000..9999)}"
|
455
|
-
end
|
456
|
-
|
457
|
-
def self.generate_constant_keyword_value
|
458
|
-
# Generate constant keyword (always the same value)
|
459
|
-
"constant_value"
|
460
|
-
end
|
461
|
-
|
462
|
-
def self.generate_geo_shape_value
|
463
|
-
# Generate simple geo shapes (point)
|
464
|
-
{
|
465
|
-
'type' => "point",
|
466
|
-
'coordinates' => [rand(-180.0..180.0).round(6), rand(-90.0..90.0).round(6)]
|
467
|
-
}
|
468
|
-
end
|
469
|
-
|
470
|
-
def self.generate_date_range_value
|
471
|
-
# Generate date range
|
472
|
-
start_date = Time.now - (365 * 24 * 60 * 60)
|
473
|
-
end_date = Time.now
|
474
|
-
{
|
475
|
-
'gte' => start_date.iso8601,
|
476
|
-
'lte' => end_date.iso8601
|
477
|
-
}
|
478
|
-
end
|
479
|
-
|
480
|
-
def self.generate_integer_range_value
|
481
|
-
# Generate integer range
|
482
|
-
start_val = rand(-1000..1000)
|
483
|
-
end_val = start_val + rand(1..1000)
|
484
|
-
{
|
485
|
-
'gte' => start_val,
|
486
|
-
'lte' => end_val
|
487
|
-
}
|
488
|
-
end
|
489
|
-
|
490
|
-
def self.generate_float_range_value
|
491
|
-
# Generate float range
|
492
|
-
start_val = (rand * 100 - 50).round(2)
|
493
|
-
end_val = start_val + (rand * 100).round(2)
|
494
|
-
{
|
495
|
-
'gte' => start_val,
|
496
|
-
'lte' => end_val
|
497
|
-
}
|
498
|
-
end
|
101
|
+
private
|
499
102
|
|
500
|
-
|
501
|
-
|
502
|
-
|
503
|
-
|
504
|
-
|
505
|
-
|
506
|
-
|
507
|
-
|
508
|
-
|
509
|
-
|
510
|
-
|
511
|
-
|
512
|
-
|
513
|
-
end_val = start_val + (rand * 1000).round(4)
|
514
|
-
{
|
515
|
-
'gte' => start_val,
|
516
|
-
'lte' => end_val
|
517
|
-
}
|
518
|
-
end
|
519
|
-
|
520
|
-
def self.generate_ip_range_value
|
521
|
-
# Generate IP range with proper ordering
|
522
|
-
# Generate a base IP and add a small range to it
|
523
|
-
base_ip = "#{rand(1..254)}.#{rand(0..255)}.#{rand(0..255)}.#{rand(1..254)}"
|
524
|
-
|
525
|
-
# Parse the last octet and create a small range
|
526
|
-
parts = base_ip.split('.')
|
527
|
-
last_octet = parts[3].to_i
|
528
|
-
start_last = [last_octet, 254].min
|
529
|
-
end_last = [start_last + rand(1..10), 254].min
|
530
|
-
|
531
|
-
start_ip = "#{parts[0]}.#{parts[1]}.#{parts[2]}.#{start_last}"
|
532
|
-
end_ip = "#{parts[0]}.#{parts[1]}.#{parts[2]}.#{end_last}"
|
533
|
-
|
534
|
-
{
|
535
|
-
'gte' => start_ip,
|
536
|
-
'lte' => end_ip
|
537
|
-
}
|
103
|
+
def resolve_to_index_name(name)
|
104
|
+
# If it's an alias, get the actual index name it points to
|
105
|
+
if @client.alias_exists?(name)
|
106
|
+
indices = @client.get_alias_indices(name)
|
107
|
+
if indices.length != 1
|
108
|
+
raise "Alias '#{name}' points to multiple indices: #{indices.join(', ')}. Cannot determine which index to use for seeding."
|
109
|
+
end
|
110
|
+
return indices.first
|
111
|
+
end
|
112
|
+
|
113
|
+
# If it's already an index name, return it as-is
|
114
|
+
name
|
115
|
+
end
|
538
116
|
end
|
539
|
-
end
|
117
|
+
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: schema-tools
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.9
|
4
|
+
version: 1.0.10
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Rich Kuzsma
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2025-10-
|
11
|
+
date: 2025-10-19 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rake
|
@@ -92,6 +92,10 @@ files:
|
|
92
92
|
- lib/schema_tools/seed.rb
|
93
93
|
- lib/schema_tools/settings_diff.rb
|
94
94
|
- lib/schema_tools/settings_filter.rb
|
95
|
+
- lib/seeder/base_doc_seeder.rb
|
96
|
+
- lib/seeder/custom_doc_seeder.rb
|
97
|
+
- lib/seeder/mappings_doc_seeder.rb
|
98
|
+
- lib/seeder/sample_doc_seeder.rb
|
95
99
|
- lib/seeder/seeder.rb
|
96
100
|
- lib/tasks/schema.rake
|
97
101
|
- lib/tasks/test.rake
|