RubyGems - schema-tools - Versions diffs - 1.0.9 → 1.0.10 - Mend

schema-tools 1.0.9 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10) hide show

checksums.yaml +4 -4
data/README.md +46 -1
data/lib/schema_tools/schema_files.rb +15 -0
data/lib/schema_tools/seed.rb +34 -25
data/lib/seeder/base_doc_seeder.rb +7 -0
data/lib/seeder/custom_doc_seeder.rb +15 -0
data/lib/seeder/mappings_doc_seeder.rb +451 -0
data/lib/seeder/sample_doc_seeder.rb +20 -0
data/lib/seeder/seeder.rb +84 -506
metadata +6 -2

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2b738fb047f313fd9e18d008f02e3bfcbb49ba675fd59f927242340212e2d098
-  data.tar.gz: fd89c552170b2b080bb0327963ac47fe42da36a2aaa859aaf28a150f65e61367
+  metadata.gz: ace88e188ea3e99453282c5b8a4f3160aabd50a1791b44adbbbbaa96ec2d9801
+  data.tar.gz: c4cb7b090362308b92c9e8f857110d7d104eef7907649825669fc46cc34f567b
 SHA512:
-  metadata.gz: 9553ce52859e437d88a4680af4234822b96f3c97a5518c7ba6ffea37b75b81c9336a4167fa1add2e824bf2aaf452aec382970d9c134139d56b3aee1288f4347a
-  data.tar.gz: b11e364c73bb62b6e7f9d49d44740fdeda4fe11483d6310b196595124c2835f716a0c1639346a56d60dc89c5bdaa7a651c490931fb6cf01b613a6757e0dd65ab
+  metadata.gz: 9d69414e92b731a19f47c98cb97a0cb8c6f2994be31f61930d7fcab7cc60272bd4c5af2790cbdd5eae7061a737eef744f7ab643d58c0647dfb3b166ad1f8d2b3
+  data.tar.gz: 2441ceb2f6d920aaf4e33b1116978f014e093bd433d4dc2ee3206b79d06c5dd43353de449f3fb7cc35a941a104eacdd8497e4601f246e4d2db5eaae79bb4a03b

data/README.md CHANGED Viewed

@@ -134,10 +134,55 @@ schemas/users
 Each schema folder name matches the name of an alias.
-## Other settings and tasks
+## Seed sample data
 Use `rake schema:seed` to seed an index with sample documents that conform to your schema.
+The seeder can generate sample docs for an index in three ways:
+1. (Default) Mappings-based seeder
+The seeder generates random data that conforms to the index's mappings.
+2. Sample-based seeder
+Add a `sample_docs.json` file in the schema folder with example docs to randomly select from when seeding:
+```json
+{
+  "hits": [
+    {
+      "_source": {
+        "title": "Foo",
+        "desc": "Bar"
+      }
+    },
+    ...
+  ]
+}
+```
+3. Custom document seeder
+Add a `doc_seeder.rb` file in the schema folder that defines a `DocSeeder` class:
+```ruby
+# schema:seed invokes this class when seeding test data for this index
+class DocSeeder
+  def initialize(index_or_alias_name) end
+  def generate_document
+    return {
+      'title' => 'Foo',
+      'desc' => 'Bar'
+    }
+  end
+end
+```
+The seeder first looks for a Custom document seeder. If none is found, it falls back to the Sample-based seeder. If no sample documents are found, it falls back to the Mappings-based seeder.
+## Other settings and tasks
 Use `DRYRUN` to simulate but not apply any POST/PUT/DELETE operations to your index:
 ```

data/lib/schema_tools/schema_files.rb CHANGED Viewed

@@ -23,6 +23,21 @@ module SchemaTools
       File.exist?(script_path) ? File.read(script_path) : nil
     end
+    # Loads the optional sample documents of a schema.
+    #
+    # Looks for <Config.schemas_path>/<alias_name>/sample_docs.json and returns
+    # its parsed JSON content, or nil when the file does not exist.
+    def self.get_sample_docs(alias_name)
+      json_file = File.join(Config.schemas_path, alias_name, 'sample_docs.json')
+      File.exist?(json_file) ? JSON.parse(File.read(json_file)) : nil
+    end
+    # Loads the optional custom document seeder of a schema.
+    #
+    # When <Config.schemas_path>/<alias_name>/doc_seeder.rb exists, the file is
+    # required and the DocSeeder constant it is expected to define is returned.
+    # Returns nil when the file does not exist.
+    def self.get_doc_seeder_class(alias_name)
+      seeder_file = File.join(Config.schemas_path, alias_name, 'doc_seeder.rb')
+      if File.exist?(seeder_file)
+        require(File.expand_path(seeder_file))
+        DocSeeder
+      end
+    end
     def self.discover_all_schemas
       return [] unless Dir.exist?(Config.schemas_path)

data/lib/schema_tools/seed.rb CHANGED Viewed

@@ -1,21 +1,43 @@
 module SchemaTools
   def self.seed(client:)
-    # List available indices (connection already validated during client initialization)
+    # List available indices and aliases (connection already validated during client initialization)
     puts "Connecting to #{Config.connection_url}..."
+    aliases = client.list_aliases
     indices = client.list_indices
-    if indices.empty?
-      puts "No indices found in the cluster."
+    single_aliases = aliases.select { |alias_name, indices| indices.length == 1 && !alias_name.start_with?('.') }
+    unaliased_indices = indices.reject { |index| aliases.values.flatten.include?(index) || index.start_with?('.') || client.index_closed?(index) }
+    # Create a combined list with sequential numbering
+    options = []
+    if single_aliases.empty? && unaliased_indices.empty?
+      puts "No indices or aliases found in the cluster."
       puts "Please create an index first."
       exit 0
     end
-    puts "Available indices:"
-    indices.each_with_index do |index_name, index|
-      puts "#{index + 1}. #{index_name}"
+    puts "Available indices and aliases:"
+    # Show aliases first
+    if single_aliases.any?
+      single_aliases.each do |alias_name, indices|
+        option_number = options.length + 1
+        options << { type: :alias, name: alias_name, index: indices.first }
+        puts "#{option_number}. #{alias_name} -> #{indices.first}"
+      end
+    end
+    # Show unaliased indices
+    if unaliased_indices.any?
+      unaliased_indices.each do |index_name|
+        option_number = options.length + 1
+        options << { type: :index, name: index_name, index: index_name }
+        puts "#{option_number}. #{index_name}"
+      end
     end
-    puts "\nPlease select an index by number (1-#{indices.length}):"
+    puts "\nPlease select an index or alias by number (1-#{options.length}):"
     selection_input = STDIN.gets&.chomp
     if selection_input.nil?
       puts "No input provided. Exiting."
@@ -23,24 +45,13 @@ module SchemaTools
     end
     selection = selection_input.to_i
-    if selection < 1 || selection > indices.length
+    if selection < 1 || selection > options.length
       puts "Invalid selection. Please run the task again and select a valid number."
       exit 1
     end
-    selected_index = indices[selection - 1]
-    puts "Selected index: #{selected_index}"
-    # Fetch the mappings for the selected index
-    puts "Fetching mappings for #{selected_index}..."
-    mappings = client.get_index_mappings(selected_index)
-    if mappings.nil?
-      puts "Failed to fetch mappings for #{selected_index}"
-      exit 1
-    end
-    puts "Mappings fetched successfully."
+    selected_option = options[selection - 1]
+    puts "Selected #{selected_option[:type]}: #{selected_option[:name]}"
     # Prompt user for number of documents to seed
     puts "\nHow many documents would you like to seed?"
@@ -56,9 +67,7 @@ module SchemaTools
       exit 1
     end
-    puts "Seeding #{num_docs} documents from #{selected_index}..."
-    # Call the seeding function
-    Seed.seed_data(num_docs, mappings, client, selected_index)
+    seeder = Seeder::Seeder.new(index_or_alias_name: selected_option[:name], client: client)
+    seeder.seed(num_docs: num_docs, batch_size: 5)
   end
 end

data/lib/seeder/base_doc_seeder.rb ADDED Viewed

@@ -0,0 +1,7 @@
+module SchemaTools::Seeder
+  # Abstract parent of the document seeders: a concrete seeder overrides
+  # #generate_document to return a generated document.
+  class BaseDocSeeder
+    # Always raises NotImplementedError; subclasses supply the real behaviour.
+    def generate_document
+      raise(NotImplementedError, "Subclasses must implement #generate_document")
+    end
+  end
+end

data/lib/seeder/custom_doc_seeder.rb ADDED Viewed

@@ -0,0 +1,15 @@
+module SchemaTools::Seeder
+  # Parent class for schema-specific seeders.
+  #
+  # A schema gets a custom seeder by shipping schemas/{alias_name}/doc_seeder.rb
+  # with a class DocSeeder that extends CustomDocSeeder.
+  class CustomDocSeeder < BaseDocSeeder
+    # index_or_alias_name: name of the index or alias being seeded
+    def initialize(index_or_alias_name)
+      @index_or_alias_name = index_or_alias_name
+    end
+    attr_reader :index_or_alias_name
+    # Always raises NotImplementedError; DocSeeder classes must override it.
+    def generate_document
+      raise(NotImplementedError, "Subclasses must implement #generate_document")
+    end
+  end
+end

data/lib/seeder/mappings_doc_seeder.rb ADDED Viewed

@@ -0,0 +1,451 @@
+require 'securerandom'
+module SchemaTools::Seeder
+  # Creates seed documents by generating random values of the correct type for
+  # each field of an OpenSearch/Elasticsearch index mapping.
+  #
+  # #generate_document is the instance-level entry point; the individual
+  # generators are class methods and can be called directly.
+  class MappingsDocSeeder < BaseDocSeeder
+    # System word list used to produce realistic-looking text.
+    DICTIONARY_PATH = '/usr/share/dict/words'
+    # Size of the window random dates are drawn from (365 days, in seconds).
+    SECONDS_PER_YEAR = 365 * 24 * 60 * 60
+    # Lower bound for rank_features scores, which must be positive normal floats.
+    MIN_RANK_FEATURE = 1.0e-30
+    # Generators that need no field configuration, keyed by mapping type.
+    SIMPLE_GENERATORS = {
+      'text' => :generate_text_value,
+      'keyword' => :generate_keyword_value,
+      'long' => :generate_integer_value,
+      'integer' => :generate_integer_value,
+      'short' => :generate_short_value,
+      'float' => :generate_float_value,
+      'double' => :generate_float_value,
+      'boolean' => :generate_boolean_value,
+      'rank_features' => :generate_rank_features_value,
+      'completion' => :generate_completion_value,
+      'search_as_you_type' => :generate_search_as_you_type_value,
+      'token_count' => :generate_token_count_value,
+      'byte' => :generate_byte_value,
+      'half_float' => :generate_half_float_value,
+      'scaled_float' => :generate_scaled_float_value,
+      'unsigned_long' => :generate_unsigned_long_value,
+      'date_nanos' => :generate_date_nanos_value,
+      'wildcard' => :generate_wildcard_value,
+      'geo_shape' => :generate_geo_shape_value,
+      'date_range' => :generate_date_range_value,
+      'integer_range' => :generate_integer_range_value,
+      'float_range' => :generate_float_range_value,
+      'long_range' => :generate_long_range_value,
+      'double_range' => :generate_double_range_value,
+      'ip_range' => :generate_ip_range_value,
+      'geo_point' => :generate_geo_point_value,
+      'ip' => :generate_ip_value,
+      'binary' => :generate_binary_value
+    }.freeze
+    # Words used when the system word list is missing, unreadable or empty.
+    FALLBACK_WORDS = %w[
+      lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor
+      incididunt ut labore et dolore magna aliqua enim ad minim veniam quis nostrud
+      exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat duis aute
+      irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat
+      nulla pariatur excepteur sint occaecat cupidatat non proident sunt in culpa
+      qui officia deserunt mollit anim id est laborum search engine data ruby
+      document index mapping schema elasticsearch opensearch cluster node shard
+      replica primary secondary analysis tokenizer filter analyzer query filter
+      aggregation pipeline script painless groovy mustache template kibana
+      logstash beats metricbeat filebeat packetbeat heartbeat auditbeat
+      functionbeat winlogbeat journalbeat apm agent apm server fleet agent
+      policy enrollment token integration package endpoint security detection
+      rule machine learning anomaly detection forecasting classification
+      regression clustering outlier detection natural language processing
+      vector search semantic search neural search transformer embedding
+      vector database similarity search recommendation system personalization
+      real-time streaming batch processing event sourcing cqrs microservices
+      distributed system scalability performance optimization monitoring
+      observability logging metrics tracing alerting notification dashboard
+      visualization reporting analytics business intelligence data science
+      machine learning artificial intelligence deep learning neural network
+      algorithm model training inference prediction classification regression
+      clustering dimensionality reduction feature engineering data preprocessing
+      validation testing deployment production staging development environment
+      configuration management version control continuous integration continuous
+      deployment devops infrastructure as code containerization orchestration
+      kubernetes docker swarm mesos nomad consul etcd zookeeper redis memcached
+      rabbitmq kafka pulsar nats jetstream grpc rest api graphql websocket
+      http https tls ssl certificate authentication authorization oauth jwt
+      saml ldap active directory kerberos rbac abac policy enforcement
+      compliance governance security audit vulnerability assessment penetration
+      testing threat modeling risk management incident response disaster recovery
+      backup restore high availability fault tolerance load balancing auto-scaling
+      horizontal scaling vertical scaling sharding partitioning replication
+      consistency eventual consistency strong consistency cap theorem acid
+      base distributed consensus raft paxos byzantine fault tolerance
+    ].freeze
+    # mappings: OpenSearch/Elasticsearch index mappings (Hash with a 'properties' entry)
+    def initialize(mappings)
+      @mappings = mappings
+    end
+    # Returns a Hash with one random value per mapped field. Fields whose
+    # generator returns nil (alias fields) are left out.
+    def generate_document
+      self.class.generate_object_value(@mappings['properties'])
+    end
+    # Returns a random value for a single field.
+    #
+    # field_config: the field's mapping Hash with string keys ('type', and
+    # where relevant 'properties', 'format', 'value'). Unknown or missing types
+    # produce a keyword value; alias fields produce nil.
+    def self.generate_field_value(field_config)
+      field_type = field_config['type']
+      # Mappings omit "type" for plain object fields; properties imply object.
+      field_type ||= 'object' if field_config['properties']
+      case field_type
+      when 'alias'
+        # Skip alias fields - they point to other fields
+        nil
+      when 'date'
+        generate_date_value(field_config['format'])
+      when 'object'
+        generate_object_value(field_config['properties'])
+      when 'nested'
+        generate_nested_value(field_config['properties'])
+      when 'constant_keyword'
+        generate_constant_keyword_value(field_config['value'])
+      else
+        # Default to keyword for unknown types
+        public_send(SIMPLE_GENERATORS.fetch(field_type, :generate_keyword_value))
+      end
+    end
+    # A paragraph of 10-50 random words.
+    def self.generate_text_value
+      Array.new(rand(10..50)) { dictionary_words.sample }.join(' ')
+    end
+    # A single word or a short phrase.
+    def self.generate_keyword_value
+      case rand(1..4)
+      when 1 then dictionary_words.sample
+      when 2 then "#{dictionary_words.sample}_#{rand(1000..9999)}"
+      when 3 then "#{dictionary_words.sample} #{dictionary_words.sample}"
+      when 4 then "#{dictionary_words.sample}-#{dictionary_words.sample}"
+      end
+    end
+    # Integers drawn from a mix of ranges that mimic common use cases.
+    def self.generate_integer_value
+      case rand(1..5)
+      when 1 then rand(1..1000) # Small positive numbers
+      when 2 then rand(1_000_000..999_999_999) # Large IDs
+      when 3 then rand(-100..100) # Small range including negatives
+      when 4 then rand(1..100) # Percentages/scores
+      when 5 then rand(1..365) # Days/periods
+      end
+    end
+    # Small integers, well inside the Java short range (-32,768 to 32,767).
+    def self.generate_short_value
+      case rand(1..3)
+      when 1 then rand(1..100) # Small positive numbers (common for ratings, counts)
+      when 2 then rand(-100..100) # Small range including negatives
+      when 3 then rand(1..10) # Very small numbers (ratings, flags)
+      end
+    end
+    # Decimal numbers with varying magnitude and precision.
+    def self.generate_float_value
+      case rand(1..3)
+      when 1 then (rand * 100).round(2) # 0-100 with 2 decimal places
+      when 2 then (rand * 1000).round(4) # 0-1000 with 4 decimal places
+      when 3 then (rand * 10 - 5).round(3) # -5 to 5 with 3 decimal places
+      end
+    end
+    def self.generate_boolean_value
+      [true, false].sample
+    end
+    # A random date within the last year, rendered according to the mapping's
+    # date format. Formats not listed here fall back to ISO 8601.
+    def self.generate_date_value(format = nil)
+      random_time = random_time_in_last_year
+      case format
+      when 'epoch_millis'
+        (random_time.to_f * 1000).to_i
+      when 'epoch_second'
+        random_time.to_i
+      when 'yyyy-MM-dd'
+        random_time.strftime('%Y-%m-%d')
+      when 'yyyy-MM-dd HH:mm:ss'
+        random_time.strftime('%Y-%m-%d %H:%M:%S')
+      when 'MM/dd/yyyy'
+        random_time.strftime('%m/%d/%Y')
+      when 'dd-MM-yyyy'
+        random_time.strftime('%d-%m-%Y')
+      else
+        # Default to ISO 8601 format
+        random_time.iso8601
+      end
+    end
+    # A Hash with one generated value per sub-field; {} when properties is nil.
+    # Sub-fields whose generator returns nil (alias fields) are left out, as
+    # they are at the top level of a document.
+    def self.generate_object_value(properties)
+      return {} unless properties
+      properties.each_with_object({}) do |(field_name, field_config), object|
+        value = generate_field_value(field_config)
+        object[field_name] = value unless value.nil?
+      end
+    end
+    # An Array of 1-3 generated objects; [] when properties is nil.
+    def self.generate_nested_value(properties)
+      return [] unless properties
+      Array.new(rand(1..3)) { generate_object_value(properties) }
+    end
+    # A Hash of 3-8 random feature names mapped to scores in (0, 1].
+    def self.generate_rank_features_value
+      features = {}
+      rand(3..8).times do
+        feature_name = "#{dictionary_words.sample}_#{rand(100..999)}"
+        # Rounding can yield 0.0, which is not a positive normal float; clamp it.
+        features[feature_name] = [rand(MIN_RANK_FEATURE..1.0).round(4), MIN_RANK_FEATURE].max
+      end
+      features
+    end
+    # Random latitude/longitude coordinates.
+    def self.generate_geo_point_value
+      {
+        lat: (rand * 180 - 90).round(6), # -90 to 90
+        lon: (rand * 360 - 180).round(6) # -180 to 180
+      }
+    end
+    # A random IPv4 or (simplified) IPv6 address.
+    def self.generate_ip_value
+      case rand(1..2)
+      when 1
+        "#{rand(1..254)}.#{rand(0..255)}.#{rand(0..255)}.#{rand(1..254)}"
+      when 2
+        "2001:db8::#{rand(1000..9999)}:#{rand(1000..9999)}:#{rand(1000..9999)}:#{rand(1000..9999)}"
+      end
+    end
+    # 32 random bytes, base64 encoded on a single line.
+    def self.generate_binary_value
+      random_bytes = Array.new(32) { rand(256) }.pack('C*')
+      # pack('m0') is strict base64 without line feeds; it avoids requiring the
+      # base64 library, which is no longer a default gem in recent Rubies.
+      [random_bytes].pack('m0')
+    end
+    # A completion suggestion with two inputs and a weight.
+    def self.generate_completion_value
+      {
+        'input' => [dictionary_words.sample, "#{dictionary_words.sample} #{dictionary_words.sample}"],
+        'weight' => rand(1..100)
+      }
+    end
+    # Three random words.
+    def self.generate_search_as_you_type_value
+      "#{dictionary_words.sample} #{dictionary_words.sample} #{dictionary_words.sample}"
+    end
+    # Token count (integer representing a number of tokens).
+    def self.generate_token_count_value
+      rand(1..50)
+    end
+    # Byte values (-128 to 127).
+    def self.generate_byte_value
+      rand(-128..127)
+    end
+    # Half-float values (-50 to 50, 2 decimal places).
+    def self.generate_half_float_value
+      (rand * 100 - 50).round(2)
+    end
+    # Scaled float values (0 to 100, 2 decimal places).
+    def self.generate_scaled_float_value
+      (rand * 100).round(2)
+    end
+    # Unsigned long values (0 to 2^64-1 is allowed, but kept reasonable).
+    def self.generate_unsigned_long_value
+      rand(0..999_999_999)
+    end
+    # A random date within the last year with a random nanosecond fraction.
+    def self.generate_date_nanos_value
+      fraction = Rational(rand(1_000_000_000), 1_000_000_000)
+      (random_time_in_last_year + fraction).iso8601(9)
+    end
+    # Wildcard text (same shape as a suffixed keyword).
+    def self.generate_wildcard_value
+      "#{dictionary_words.sample}_#{rand(1000..9999)}"
+    end
+    # The value every document must carry for a constant_keyword field.
+    #
+    # value: the 'value' configured in the mapping, if any. Without it a fixed
+    # placeholder is returned so that all generated documents agree.
+    def self.generate_constant_keyword_value(value = nil)
+      value || "constant_value"
+    end
+    # A simple geo shape (point) as [longitude, latitude].
+    def self.generate_geo_shape_value
+      {
+        'type' => "point",
+        'coordinates' => [rand(-180.0..180.0).round(6), rand(-90.0..90.0).round(6)]
+      }
+    end
+    # A date range spanning the last year up to now.
+    def self.generate_date_range_value
+      end_date = Time.now
+      start_date = end_date - SECONDS_PER_YEAR
+      { 'gte' => start_date.iso8601, 'lte' => end_date.iso8601 }
+    end
+    # An integer range of width 1-1000.
+    def self.generate_integer_range_value
+      start_val = rand(-1000..1000)
+      { 'gte' => start_val, 'lte' => start_val + rand(1..1000) }
+    end
+    # A float range starting between -50 and 50.
+    def self.generate_float_range_value
+      start_val = (rand * 100 - 50).round(2)
+      { 'gte' => start_val, 'lte' => start_val + (rand * 100).round(2) }
+    end
+    # A long range of width 1-1,000,000.
+    def self.generate_long_range_value
+      start_val = rand(-1_000_000..1_000_000)
+      { 'gte' => start_val, 'lte' => start_val + rand(1..1_000_000) }
+    end
+    # A double range starting between -500 and 500.
+    def self.generate_double_range_value
+      start_val = (rand * 1000 - 500).round(4)
+      { 'gte' => start_val, 'lte' => start_val + (rand * 1000).round(4) }
+    end
+    # An IPv4 range inside one /24 network, ordered so that gte <= lte.
+    def self.generate_ip_range_value
+      network = "#{rand(1..254)}.#{rand(0..255)}.#{rand(0..255)}"
+      first_host = rand(1..254)
+      # Keep the upper bound a valid host octet
+      last_host = [first_host + rand(1..10), 254].min
+      { 'gte' => "#{network}.#{first_host}", 'lte' => "#{network}.#{last_host}" }
+    end
+    # Words used by the text generators; loaded once and then cached.
+    def self.dictionary_words
+      @dictionary_words ||= load_dictionary_words
+    end
+    # Reads the 3-10 letter words of the system word list, falling back to
+    # FALLBACK_WORDS when the file is missing, unreadable or yields no words.
+    def self.load_dictionary_words
+      words = File.readlines(DICTIONARY_PATH)
+                  .map(&:chomp)
+                  .select { |w| w.length.between?(3, 10) }
+      words.empty? ? FALLBACK_WORDS : words
+    rescue SystemCallError
+      FALLBACK_WORDS
+    end
+    # A random whole-second Time within the last 365 days.
+    def self.random_time_in_last_year
+      Time.at(Time.now.to_i - SECONDS_PER_YEAR + rand(SECONDS_PER_YEAR))
+    end
+    private_class_method :load_dictionary_words, :random_time_in_last_year
+  end
+end

data/lib/seeder/sample_doc_seeder.rb ADDED Viewed

@@ -0,0 +1,20 @@
+require 'securerandom'
+require 'active_support/all'
+module SchemaTools::Seeder
+  # Generate a document by choosing a document at random from an array of sample documents
+  #
+  # The seeder looks for sample docs in schemas/{alias_name}/sample_docs.json
+  # in the form: { "hits": [ { "_source": { "title": "Foo", "desc": "Bar" } }, ... ] }
+  # A raw search response, where the array sits under "hits" => "hits", is
+  # accepted as well.
+  class SampleDocSeeder < BaseDocSeeder
+    # sample_docs: parsed sample docs (Hash with a "hits" entry, see above)
+    # Raises ArgumentError when no array of hits can be found.
+    def initialize(sample_docs)
+      hits = sample_docs['hits']
+      # Search responses nest the hit array one level deeper (hits.hits)
+      hits = hits['hits'] if hits.is_a?(Hash)
+      raise ArgumentError, 'sample docs must contain a "hits" array' unless hits.is_a?(Array)
+      # Plain map instead of ActiveSupport's pluck; hits without a "_source"
+      # are dropped so they are never returned as nil documents.
+      @sample_docs = hits.map { |hit| hit['_source'] }.compact
+    end
+    # Returns one of the sample documents at random (nil when there are none).
+    def generate_document
+      @sample_docs.sample
+    end
+  end
+end

data/lib/seeder/seeder.rb CHANGED Viewed

@@ -1,539 +1,117 @@
 require 'json'
 require 'time'
-module Seed
-  # Word list for generating realistic text content
-  WORD_LIST = %w[
-    lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor
-    incididunt ut labore et dolore magna aliqua enim ad minim veniam quis nostrud
-    exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat duis aute
-    irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat
-    nulla pariatur excepteur sint occaecat cupidatat non proident sunt in culpa
-    qui officia deserunt mollit anim id est laborum search engine data ruby
-    document index mapping schema elasticsearch opensearch cluster node shard
-    replica primary secondary analysis tokenizer filter analyzer query filter
-    aggregation pipeline script painless groovy mustache template kibana
-    logstash beats metricbeat filebeat packetbeat heartbeat auditbeat
-    functionbeat winlogbeat journalbeat apm agent apm server fleet agent
-    policy enrollment token integration package endpoint security detection
-    rule machine learning anomaly detection forecasting classification
-    regression clustering outlier detection natural language processing
-    vector search semantic search neural search transformer embedding
-    vector database similarity search recommendation system personalization
-    real-time streaming batch processing event sourcing cqrs microservices
-    distributed system scalability performance optimization monitoring
-    observability logging metrics tracing alerting notification dashboard
-    visualization reporting analytics business intelligence data science
-    machine learning artificial intelligence deep learning neural network
-    algorithm model training inference prediction classification regression
-    clustering dimensionality reduction feature engineering data preprocessing
-    validation testing deployment production staging development environment
-    configuration management version control continuous integration continuous
-    deployment devops infrastructure as code containerization orchestration
-    kubernetes docker swarm mesos nomad consul etcd zookeeper redis memcached
-    rabbitmq kafka pulsar nats jetstream grpc rest api graphql websocket
-    http https tls ssl certificate authentication authorization oauth jwt
-    saml ldap active directory kerberos rbac abac policy enforcement
-    compliance governance security audit vulnerability assessment penetration
-    testing threat modeling risk management incident response disaster recovery
-    backup restore high availability fault tolerance load balancing auto-scaling
-    horizontal scaling vertical scaling sharding partitioning replication
-    consistency eventual consistency strong consistency cap theorem acid
-    base distributed consensus raft paxos byzantine fault tolerance
-  ].freeze
-  def self.seed_data(num_docs, mappings_json, client, index_name)
-    puts "Seeding #{num_docs} documents to index: #{index_name}"
-    # Parse the mappings to understand the schema
-    schema = parse_mappings(mappings_json)
-    puts "Parsed schema with #{schema.keys.length} top-level fields"
-    # Generate documents in batches for efficiency
-    # Reduced batch size to avoid circuit breaker issues with large documents
-    batch_size = 25  # Reduced from 100 to 25 for large documents
-    total_batches = (num_docs.to_f / batch_size).ceil
-    (1..total_batches).each do |batch_num|
-      docs_in_batch = [batch_size, num_docs - (batch_num - 1) * batch_size].min
-      puts "Generating batch #{batch_num}/#{total_batches} (#{docs_in_batch} documents)..."
-      documents = generate_document_batch(docs_in_batch, schema)
-      puts "Indexing batch #{batch_num}..."
-      begin
-        response = client.bulk_index(documents, index_name)
-        # Check for errors in bulk response
-        if response['errors']
-          error_items = response['items'].select { |item| item.dig('index', 'status') >= 400 }
-          error_count = error_items.length
-          if error_count > 0
-            puts "WARN: #{error_count} documents failed to index in batch #{batch_num}"
-            # Print first few errors for debugging
-            error_items.first(3).each_with_index do |item, index|
-              error_info = item.dig('index', 'error')
-              if error_info
-                puts "  Error #{index + 1}: #{error_info['type']} - #{error_info['reason']}"
-                if error_info['caused_by']
-                  puts "    Caused by: #{error_info['caused_by']['type']} - #{error_info['caused_by']['reason']}"
-                end
-              end
-            end
-            if error_count > 3
-              puts "  ... and #{error_count - 3} more errors"
-            end
-          end
-        end
-        puts "Successfully indexed batch #{batch_num}"
-      rescue => e
-        if e.message.include?('circuit_breaking_exception') || e.message.include?('HTTP 429')
-          puts "ERROR: Circuit breaker triggered - OpenSearch cluster is out of memory"
-          puts "Consider:"
-          puts "  1. Reducing batch size further (currently #{batch_size})"
-          puts "  2. Increasing OpenSearch heap size"
-          puts "  3. Reducing document size/complexity"
-          puts "  4. Adding delays between batches"
-          puts ""
-          puts "Batch #{batch_num} failed: #{e.message}"
-          raise StandardError.new("Circuit breaker triggered - OpenSearch cluster is out of memory")
-        else
-          puts "Error indexing batch #{batch_num}: #{e.message}"
-          raise e
-        end
-      end
-      # Add a small delay between batches to help with memory pressure
-      sleep(0.1) if batch_num < total_batches
+module SchemaTools::Seeder
+  class Seeder
+    def initialize(index_or_alias_name:, client:)
+      @client = client
+      @index_or_alias_name = index_or_alias_name
+      @doc_seeder = initialize_doc_seeder
     end
-    puts "Successfully seeded #{num_docs} documents to #{index_name}"
-  end
-  private
+    def initialize_doc_seeder
+      custom_doc_seeder_class = SchemaTools::SchemaFiles.get_doc_seeder_class(@index_or_alias_name)
+      return custom_doc_seeder_class.new(@index_or_alias_name) if custom_doc_seeder_class
-  def self.parse_mappings(mappings_json)
-    # Extract the properties from the mappings
-    properties = mappings_json.dig('properties') || {}
-    parse_properties(properties)
-  end
+      sample_docs = SchemaTools::SchemaFiles.get_sample_docs(@index_or_alias_name)
+      return SampleDocSeeder.new(sample_docs) if sample_docs
-  def self.parse_properties(properties)
-    schema = {}
-    properties.each do |field_name, field_config|
-      # If a field has properties but no explicit type, it's an object
-      field_type = field_config['type'] || (field_config['properties'] ? 'object' : 'keyword')
-      schema[field_name] = {
-        type: field_type,
-        properties: field_config['properties'],
-        format: field_config['format']
-      }
-    end
-    schema
-  end
+      # Resolve alias to actual index name if needed
+      actual_index_name = resolve_to_index_name(@index_or_alias_name)
+      mappings = @client.get_index_mappings(actual_index_name)
+      return MappingsDocSeeder.new(mappings) if mappings
-  def self.generate_document_batch(count, schema)
-    count.times.map do
-      generate_document(schema)
+      raise "No custom document seeder, sample documents, or mappings found for #{@index_or_alias_name}"
     end
-  end
-  def self.generate_document(schema)
-    document = {}
-    schema.each do |field_name, field_config|
-      value = generate_field_value(field_config)
-      # Skip fields that return nil (like alias fields)
-      document[field_name] = value unless value.nil?
-    end
+    def seed(num_docs:, batch_size: 5)
+      puts "Seeding #{num_docs} in batches of #{batch_size} documents from #{@index_or_alias_name} using #{@doc_seeder.class.name}"
-    document
-  end
+      total_batches = (num_docs.to_f / batch_size).ceil
+      total_seeded_docs = 0
-  def self.generate_field_value(field_config)
-    field_type = field_config[:type]
-    case field_type
-    when 'text'
-      generate_text_value
-    when 'keyword'
-      generate_keyword_value
-    when 'long', 'integer'
-      generate_integer_value
-    when 'short'
-      generate_short_value
-    when 'float', 'double'
-      generate_float_value
-    when 'boolean'
-      generate_boolean_value
-    when 'date'
-      generate_date_value(field_config[:format])
-    when 'object'
-      generate_object_value(field_config[:properties])
-    when 'nested'
-      generate_nested_value(field_config[:properties])
-    when 'rank_features'
-      generate_rank_features_value
-    when 'completion'
-      generate_completion_value
-    when 'search_as_you_type'
-      generate_search_as_you_type_value
-    when 'token_count'
-      generate_token_count_value
-    when 'alias'
-      # Skip alias fields - they point to other fields
-      nil
-    when 'byte'
-      generate_byte_value
-    when 'half_float'
-      generate_half_float_value
-    when 'scaled_float'
-      generate_scaled_float_value
-    when 'unsigned_long'
-      generate_unsigned_long_value
-    when 'date_nanos'
-      generate_date_nanos_value
-    when 'wildcard'
-      generate_wildcard_value
-    when 'constant_keyword'
-      generate_constant_keyword_value
-    when 'geo_shape'
-      generate_geo_shape_value
-    when 'date_range'
-      generate_date_range_value
-    when 'integer_range'
-      generate_integer_range_value
-    when 'float_range'
-      generate_float_range_value
-    when 'long_range'
-      generate_long_range_value
-    when 'double_range'
-      generate_double_range_value
-    when 'ip_range'
-      generate_ip_range_value
-    when 'geo_point'
-      generate_geo_point_value
-    when 'ip'
-      generate_ip_value
-    when 'binary'
-      generate_binary_value
-    else
-      # Default to keyword for unknown types
-      generate_keyword_value
-    end
-  end
+      num_docs.times.each_slice(batch_size).with_index(1) do |batch_range, batch_num|
+        docs_in_batch = batch_range.size
-  def self.generate_text_value
-    # Generate a paragraph of 10-50 words
-    word_count = rand(10..50)
-    word_count.times.map { WORD_LIST.sample }.join(' ')
-  end
+        puts "Generating batch #{batch_num}/#{total_batches} (#{docs_in_batch} documents)..."
+        documents = Array.new(docs_in_batch) do
+          @doc_seeder.generate_document
+        end
-  def self.generate_keyword_value
-    # Generate a short phrase or single word
-    case rand(1..4)
-    when 1
-      WORD_LIST.sample
-    when 2
-      "#{WORD_LIST.sample}_#{rand(1000..9999)}"
-    when 3
-      "#{WORD_LIST.sample} #{WORD_LIST.sample}"
-    when 4
-      "#{WORD_LIST.sample}-#{WORD_LIST.sample}"
+        puts "Indexing batch #{batch_num}..."
+        response = bulk_index(documents)
+        seeded_docs = documents.length - print_errors(response)
+        total_seeded_docs += seeded_docs
+        puts "Indexed #{seeded_docs} documents for batch #{batch_num}" if seeded_docs
+        sleep(0.1) if batch_num < total_batches # small delay to help with memory pressure
+      rescue StandardError => e
+        puts "Batch #{batch_num} failed: #{e.message}"
+        handle_circuit_breaker_exception(e, batch_size)
+        raise e
+      end
+      puts "Seeded #{total_seeded_docs} documents to #{@index_or_alias_name}"
     end
-  end
-  def self.generate_integer_value
-    # Generate reasonable integer values based on common use cases
-    case rand(1..5)
-    when 1
-      rand(1..1000) # Small positive numbers
-    when 2
-      rand(1_000_000..999_999_999) # Large IDs
-    when 3
-      rand(-100..100) # Small range including negatives
-    when 4
-      rand(1..100) # Percentages/scores
-    when 5
-      rand(1..365) # Days/periods
+    def bulk_index(documents)
+      @client.bulk_index(documents, @index_or_alias_name)
     end
-  end
-  def self.generate_short_value
-    # Generate short values within Java short range (-32,768 to 32,767)
-    case rand(1..3)
-    when 1
-      rand(1..100) # Small positive numbers (common for ratings, counts)
-    when 2
-      rand(-100..100) # Small range including negatives
-    when 3
-      rand(1..10) # Very small numbers (ratings, flags)
+    def handle_circuit_breaker_exception(error, batch_size)
+      return unless error&.message&.match?(/circuit_breaking_exception|HTTP 429/)
+      puts 'ERROR: Circuit breaker triggered - OpenSearch cluster is out of memory'
+      puts 'Consider:'
+      puts "  1. Reducing batch size further (currently #{batch_size})"
+      puts '  2. Increasing OpenSearch heap size'
+      puts '  3. Reducing document size/complexity'
+      puts '  4. Adding delays between batches'
+      puts ''
+      raise StandardError, 'Circuit breaker triggered - OpenSearch cluster is out of memory'
     end
-  end
-  def self.generate_float_value
-    # Generate decimal numbers
-    case rand(1..3)
-    when 1
-      (rand * 100).round(2) # 0-100 with 2 decimal places
-    when 2
-      (rand * 1000).round(4) # 0-1000 with 4 decimal places
-    when 3
-      (rand * 10 - 5).round(3) # -5 to 5 with 3 decimal places
-    end
-  end
+    def print_errors(response)
+      return 0 unless response['errors']
-  def self.generate_boolean_value
-    [true, false].sample
-  end
+      error_items = response['items'].select { |item| item.dig('index', 'status') >= 400 }
+      error_count = error_items.length
+      return 0 unless error_count.positive?
-  def self.generate_date_value(format = nil)
-    # Generate a random date within the last year
-    start_time = Time.now - (365 * 24 * 60 * 60) # one year ago
-    random_time = Time.at(start_time.to_i + rand(Time.now.to_i - start_time.to_i))
-    case format
-    when 'epoch_millis'
-      (random_time.to_f * 1000).to_i
-    when 'epoch_second'
-      random_time.to_i
-    when 'yyyy-MM-dd'
-      random_time.strftime('%Y-%m-%d')
-    when 'yyyy-MM-dd HH:mm:ss'
-      random_time.strftime('%Y-%m-%d %H:%M:%S')
-    when 'MM/dd/yyyy'
-      random_time.strftime('%m/%d/%Y')
-    when 'dd-MM-yyyy'
-      random_time.strftime('%d-%m-%Y')
-    else
-      # Default to ISO 8601 format
-      random_time.iso8601
-    end
-  end
+      puts "WARN: #{error_count} documents failed to index"
-  def self.generate_object_value(properties)
-    return {} unless properties
-    object = {}
-    properties.each do |nested_field_name, nested_field_config|
-      # If a field has properties but no explicit type, it's an object
-      field_type = nested_field_config['type'] || (nested_field_config['properties'] ? 'object' : 'keyword')
-      parsed_config = {
-        type: field_type,
-        properties: nested_field_config['properties'],
-        format: nested_field_config['format']
-      }
-      object[nested_field_name] = generate_field_value(parsed_config)
-    end
-    object
-  end
+      # Print first few errors for debugging
+      error_items.first(3).each_with_index do |item, index|
+        error_info = item.dig('index', 'error')
+        next unless error_info
-  def self.generate_nested_value(properties)
-    return [] unless properties
-    # Generate 1-3 nested objects
-    count = rand(1..3)
-    count.times.map do
-      object = {}
-      properties.each do |nested_field_name, nested_field_config|
-        # If a field has properties but no explicit type, it's an object
-        field_type = nested_field_config['type'] || (nested_field_config['properties'] ? 'object' : 'keyword')
-        parsed_config = {
-          type: field_type,
-          properties: nested_field_config['properties'],
-          format: nested_field_config['format']
-        }
-        object[nested_field_name] = generate_field_value(parsed_config)
+        print_error_item(error_info, index)
       end
-      object
-    end
-  end
-  def self.generate_rank_features_value
-    # Generate a rank_features object with random feature names and scores
-    # OpenSearch requires positive normal floats with minimum value of 1.17549435E-38
-    feature_count = rand(3..8)
-    features = {}
-    feature_count.times do
-      feature_name = "#{WORD_LIST.sample}_#{rand(100..999)}"
-      # Generate values between 1.0e-30 and 1.0 to ensure positive normal floats
-      # Use a higher minimum to avoid floating-point precision issues
-      min_value = 1.0e-30  # Much higher than the OpenSearch minimum
-      value = rand(min_value..1.0).round(4)
-      # Ensure we never get exactly 0.0 due to floating-point precision
-      value = [value, 1.0e-30].max
-      features[feature_name] = value
+      puts "  ... and #{error_count - 3} more errors" if error_count > 3
+      error_count
     end
-    features
-  end
-  def self.generate_geo_point_value
-    # Generate random latitude/longitude coordinates
-    {
-      lat: (rand * 180 - 90).round(6), # -90 to 90
-      lon: (rand * 360 - 180).round(6)  # -180 to 180
-    }
-  end
+    def print_error_item(error_info, index)
+      puts "  Error #{index + 1}: #{error_info['type']} - #{error_info['reason']}"
+      return unless error_info['caused_by']
-  def self.generate_ip_value
-    # Generate random IP addresses
-    case rand(1..2)
-    when 1
-      # IPv4
-      "#{rand(1..254)}.#{rand(0..255)}.#{rand(0..255)}.#{rand(1..254)}"
-    when 2
-      # IPv6 (simplified)
-      "2001:db8::#{rand(1000..9999)}:#{rand(1000..9999)}:#{rand(1000..9999)}:#{rand(1000..9999)}"
+      puts "    Caused by: #{error_info['caused_by']['type']} - #{error_info['caused_by']['reason']}"
     end
-  end
-  def self.generate_binary_value
-    # Generate base64 encoded random data
-    require 'base64'
-    random_bytes = (0...32).map { rand(256) }.pack('C*')
-    Base64.encode64(random_bytes).strip
-  end
-  def self.generate_completion_value
-    # Generate completion suggestions
-    {
-      'input' => [WORD_LIST.sample, "#{WORD_LIST.sample} #{WORD_LIST.sample}"],
-      'weight' => rand(1..100)
-    }
-  end
-  def self.generate_search_as_you_type_value
-    # Generate search-as-you-type text
-    "#{WORD_LIST.sample} #{WORD_LIST.sample} #{WORD_LIST.sample}"
-  end
-  def self.generate_token_count_value
-    # Generate token count (integer representing number of tokens)
-    rand(1..50)
-  end
-  def self.generate_byte_value
-    # Generate byte values (-128 to 127)
-    rand(-128..127)
-  end
-  def self.generate_half_float_value
-    # Generate half-float values (smaller range than regular float)
-    (rand * 100 - 50).round(2)
-  end
-  def self.generate_scaled_float_value
-    # Generate scaled float values (multiplied by scaling factor)
-    (rand * 100).round(2)
-  end
-  def self.generate_unsigned_long_value
-    # Generate unsigned long values (0 to 2^64-1, but keep reasonable)
-    rand(0..999_999_999)
-  end
-  def self.generate_date_nanos_value
-    # Generate date with nanosecond precision
-    start_time = Time.now - (365 * 24 * 60 * 60)
-    random_time = Time.at(start_time.to_i + rand(Time.now.to_i - start_time.to_i))
-    random_time.iso8601(9) # Include nanoseconds
-  end
-  def self.generate_wildcard_value
-    # Generate wildcard text (similar to keyword but optimized for wildcard queries)
-    "#{WORD_LIST.sample}_#{rand(1000..9999)}"
-  end
-  def self.generate_constant_keyword_value
-    # Generate constant keyword (always the same value)
-    "constant_value"
-  end
-  def self.generate_geo_shape_value
-    # Generate simple geo shapes (point)
-    {
-      'type' => "point",
-      'coordinates' => [rand(-180.0..180.0).round(6), rand(-90.0..90.0).round(6)]
-    }
-  end
-  def self.generate_date_range_value
-    # Generate date range
-    start_date = Time.now - (365 * 24 * 60 * 60)
-    end_date = Time.now
-    {
-      'gte' => start_date.iso8601,
-      'lte' => end_date.iso8601
-    }
-  end
-  def self.generate_integer_range_value
-    # Generate integer range
-    start_val = rand(-1000..1000)
-    end_val = start_val + rand(1..1000)
-    {
-      'gte' => start_val,
-      'lte' => end_val
-    }
-  end
-  def self.generate_float_range_value
-    # Generate float range
-    start_val = (rand * 100 - 50).round(2)
-    end_val = start_val + (rand * 100).round(2)
-    {
-      'gte' => start_val,
-      'lte' => end_val
-    }
-  end
+    private
-  def self.generate_long_range_value
-    # Generate long range
-    start_val = rand(-1_000_000..1_000_000)
-    end_val = start_val + rand(1..1_000_000)
-    {
-      'gte' => start_val,
-      'lte' => end_val
-    }
-  end
-  def self.generate_double_range_value
-    # Generate double range
-    start_val = (rand * 1000 - 500).round(4)
-    end_val = start_val + (rand * 1000).round(4)
-    {
-      'gte' => start_val,
-      'lte' => end_val
-    }
-  end
-  def self.generate_ip_range_value
-    # Generate IP range with proper ordering
-    # Generate a base IP and add a small range to it
-    base_ip = "#{rand(1..254)}.#{rand(0..255)}.#{rand(0..255)}.#{rand(1..254)}"
-    # Parse the last octet and create a small range
-    parts = base_ip.split('.')
-    last_octet = parts[3].to_i
-    start_last = [last_octet, 254].min
-    end_last = [start_last + rand(1..10), 254].min
-    start_ip = "#{parts[0]}.#{parts[1]}.#{parts[2]}.#{start_last}"
-    end_ip = "#{parts[0]}.#{parts[1]}.#{parts[2]}.#{end_last}"
-    {
-      'gte' => start_ip,
-      'lte' => end_ip
-    }
+    def resolve_to_index_name(name)
+      # If it's an alias, get the actual index name it points to
+      if @client.alias_exists?(name)
+        indices = @client.get_alias_indices(name)
+        if indices.length != 1
+          raise "Alias '#{name}' points to multiple indices: #{indices.join(', ')}. Cannot determine which index to use for seeding."
+        end
+        return indices.first
+      end
+      # If it's already an index name, return it as-is
+      name
+    end
   end
-end
+end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: schema-tools
 version: !ruby/object:Gem::Version
-  version: 1.0.9
+  version: 1.0.10
 platform: ruby
 authors:
 - Rich Kuzsma
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2025-10-17 00:00:00.000000000 Z
+date: 2025-10-19 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rake
@@ -92,6 +92,10 @@ files:
 - lib/schema_tools/seed.rb
 - lib/schema_tools/settings_diff.rb
 - lib/schema_tools/settings_filter.rb
+- lib/seeder/base_doc_seeder.rb
+- lib/seeder/custom_doc_seeder.rb
+- lib/seeder/mappings_doc_seeder.rb
+- lib/seeder/sample_doc_seeder.rb
 - lib/seeder/seeder.rb
 - lib/tasks/schema.rake
 - lib/tasks/test.rake